def test_locate_files_list(self):
    """
    Check that ``DataReader.locate_files`` returns a list when given a
    list of paths; the test expects ``None`` for each of the two paths
    (presumably because neither file exists under ``output``).
    """
    located = DataReader.locate_files(['file1.csv', 'file2.xlsx'], 'output')

    # the container type is checked separately from the contents
    assert isinstance(located, list)
    eq_(located, [None, None])
def locate_custom_sections(custom_report_section_paths, config_dir):
    """
    Locate every custom report section and fail on the first one
    that cannot be found.

    Parameters
    ----------
    custom_report_section_paths : list of str
        Paths to the notebook files representing the custom sections.
    config_dir : str
        Passed to ``DataReader.locate_files`` together with each path
        (presumably the directory containing the configuration file).

    Returns
    -------
    custom_report_sections : list
        The values returned by ``DataReader.locate_files``, one per
        input path and in the same order.

    Raises
    ------
    FileNotFoundError
        If ``DataReader.locate_files`` returns a falsy value for
        any of the paths.
    """
    located_sections = []
    for section_path in custom_report_section_paths:
        section_location = DataReader.locate_files(section_path, config_dir)

        # bail out as soon as one section cannot be located
        if not section_location:
            raise FileNotFoundError("Error: custom section not found at "
                                    "{}.".format(section_path))

        located_sections.append(section_location)

    return located_sections
def check_experiment_dir(experiment_dir, experiment_name, configpath):
    """
    Locate an experiment directory and pair each ``.json`` file in its
    ``output`` sub-directory with the given experiment name.

    Parameters
    ----------
    experiment_dir : str
        Supplied path to the experiment directory.
    experiment_name : str or None
        Custom name for the experiment. A falsy value means that
        no name was specified.
    configpath : str
        Path to the directory containing the configuration file; it is
        passed to ``DataReader.locate_files`` along with
        ``experiment_dir``.

    Returns
    -------
    list of tuple
        One ``(json_path, experiment_name)`` pair for every ``.json``
        file found in the ``output`` sub-directory.

    Raises
    ------
    FileNotFoundError
        If the directory cannot be located, has no ``output``
        sub-directory, or that sub-directory has no ``.json`` files.
    ValueError
        If an experiment name was given but more than one ``.json``
        file was found.
    """
    located_dir = DataReader.locate_files(experiment_dir, configpath)
    if not located_dir:
        raise FileNotFoundError("The directory {} "
                                "does not exist.".format(experiment_dir))

    # an rsmtool experiment always writes into an 'output' sub-directory
    output_subdir = normpath(join(located_dir, 'output'))
    if not exists(output_subdir):
        raise FileNotFoundError("The directory {} does not contain "
                                "the output of an rsmtool "
                                "experiment.".format(located_dir))

    # collect the configuration files of all experiments stored there
    config_jsons = glob.glob(join(output_subdir, '*.json'))
    if not config_jsons:
        raise FileNotFoundError("The directory {} does not contain "
                                "the .json configuration files for rsmtool "
                                "experiments.".format(located_dir))

    # a single custom name cannot be shared by several experiments
    if experiment_name and len(config_jsons) > 1:
        raise ValueError("{} seems to contain the output of multiple experiments. "
                         "In order to use custom experiment names, you must have "
                         "a separate directory "
                         "for each experiment".format(located_dir))

    # every json is paired with the (possibly unspecified) experiment name
    return [(json_path, experiment_name) for json_path in config_jsons]
def test_locate_files_works(self):
    """
    Check that ``DataReader.locate_files`` returns the absolute path of
    an existing file that is given relative to the configuration
    directory.

    The scratch directory is removed in a ``finally`` clause so that it
    does not linger on disk when the lookup raises.
    """
    config_dir = 'temp_output'
    os.makedirs(config_dir, exist_ok=True)
    try:
        paths = 'file1.csv'
        full_path = os.path.abspath(os.path.join(config_dir, paths))

        # create an empty file for `locate_files` to find
        with open(full_path, 'a'):
            pass

        result = DataReader.locate_files(paths, config_dir)
    finally:
        # clean up even if the call above fails
        rmtree(config_dir)

    eq_(result, full_path)
def check_experiment_dir(experiment_dir, configpath):
    """
    Locate an experiment directory and return the ``.json`` files
    found in its ``output`` sub-directory.

    Parameters
    ----------
    experiment_dir : str
        Supplied path to the experiment directory.
    configpath : str
        Path to the directory containing the configuration file; it is
        passed to ``DataReader.locate_files`` along with
        ``experiment_dir``.

    Returns
    -------
    jsons : list
        Paths to all ``.json`` files contained in the ``output``
        sub-directory, as returned by ``glob.glob``.

    Raises
    ------
    FileNotFoundError
        If the directory cannot be located, has no ``output``
        sub-directory, or that sub-directory has no ``.json`` files.
    """
    located_dir = DataReader.locate_files(experiment_dir, configpath)
    if not located_dir:
        raise FileNotFoundError("The directory {} "
                                "does not exist.".format(experiment_dir))

    # an rsmtool experiment always writes into an 'output' sub-directory
    output_subdir = normpath(join(located_dir, 'output'))
    if not exists(output_subdir):
        raise FileNotFoundError(
            "The directory {} does not contain "
            "the output of an rsmtool "
            "experiment.".format(located_dir))

    # collect the configuration files of all experiments stored there
    config_jsons = glob.glob(join(output_subdir, '*.json'))
    if not config_jsons:
        raise FileNotFoundError(
            "The directory {} does not contain "
            "the .json configuration files for rsmtool "
            "experiments.".format(located_dir))

    return config_jsons
def check_experiment_dir(experiment_dir, configpath):
    """
    Verify that an experiment directory can be located and holds
    rsmtool output, then return its configuration files.

    Parameters
    ----------
    experiment_dir : str
        Supplied path to the experiment directory.
    configpath : str
        Path to the directory containing the configuration file; it is
        handed to ``DataReader.locate_files`` together with
        ``experiment_dir``.

    Returns
    -------
    jsons : list
        The ``.json`` paths that ``glob.glob`` finds in the ``output``
        sub-directory of the located experiment directory.

    Raises
    ------
    FileNotFoundError
        If the directory cannot be located, if it has no ``output``
        sub-directory, or if no ``.json`` files are found in it.
    """
    resolved_dir = DataReader.locate_files(experiment_dir, configpath)

    # nothing else can be checked without a located directory
    if not resolved_dir:
        raise FileNotFoundError("The directory {} "
                                "does not exist.".format(experiment_dir))

    # check that there is an output directory
    csv_location = normpath(join(resolved_dir, 'output'))
    if not exists(csv_location):
        raise FileNotFoundError("The directory {} does not contain "
                                "the output of an rsmtool "
                                "experiment.".format(resolved_dir))

    # find the json configuration files for all experiments stored here
    found_jsons = glob.glob(join(csv_location, '*.json'))
    if not found_jsons:
        raise FileNotFoundError("The directory {} does not contain "
                                "the .json configuration files for rsmtool "
                                "experiments.".format(resolved_dir))

    return found_jsons
def test_locate_files_wrong_type(self):
    """
    Call ``DataReader.locate_files`` with a set of paths instead of a
    string or a list.

    NOTE(review): nothing is asserted here, so the expected failure is
    presumably declared by a decorator outside this block -- confirm.
    """
    unsupported_paths = {'file1.csv', 'file2.xlsx'}
    DataReader.locate_files(unsupported_paths, 'output')
def test_locate_files_str(self):
    """
    Check that ``DataReader.locate_files`` returns ``None`` for a single
    string path (presumably because ``file1.csv`` does not exist under
    ``output``).
    """
    located = DataReader.locate_files('file1.csv', 'output')
    eq_(located, None)
def run_experiment(config_file_or_obj, output_dir):
    """
    Run an RSMTool experiment from the given configuration and write
    all of its outputs under the given directory.

    Parameters
    ----------
    config_file_or_obj : str or Configuration
        Path to the experiment configuration file, or a
        ``Configuration`` object that is already in memory.
    output_dir : str
        Path to the experiment output directory.

    Raises
    ------
    FileNotFoundError
        If any of the input files named in the configuration
        cannot be located.
    ValueError
        If scaling the coefficients fails with an ``AttributeError``,
        which is reported as two experiments sharing the same ID and
        directory. The configuration parser presumably also raises this
        for missing or ill-specified fields.
    """
    logger = logging.getLogger(__name__)

    # Absolute paths to the sub-directories that will hold the CSV files,
    # the figures, the report, and the feature file
    csvdir = abspath(join(output_dir, 'output'))
    figdir = abspath(join(output_dir, 'figure'))
    reportdir = abspath(join(output_dir, 'report'))
    featuredir = abspath(join(output_dir, 'feature'))

    # Only these three are created here; `featuredir` is just passed on
    for directory in (csvdir, figdir, reportdir):
        makedirs(directory, exist_ok=True)

    # The caller may hand over either an in-memory Configuration
    # or the path to a configuration file that still has to be parsed
    if isinstance(config_file_or_obj, Configuration):
        configuration = config_file_or_obj
        if configuration.filepath is None:
            # no file behind the object: resolve paths against the cwd
            configpath = getcwd()
        else:
            configpath = dirname(configuration.filepath)
    else:
        parser = ConfigurationParser.get_configparser(config_file_or_obj)
        configuration = parser.read_normalize_validate_and_process_config(
            config_file_or_obj)

        # input paths are resolved relative to the configuration file
        configpath = dirname(config_file_or_obj)

    logger.info('Saving configuration file.')
    configuration.save(output_dir)

    file_format = configuration.get('file_format', 'csv')
    writer = DataWriter(configuration['experiment_id'])

    # Configuration fields holding input paths, and the names under
    # which the corresponding data sets are read
    path_fields = ['train_file', 'test_file', 'features', 'feature_subset_file']
    dataset_names = ['train', 'test', 'feature_specs', 'feature_subset_specs']
    (file_names, file_paths_org) = configuration.get_names_and_paths(path_fields,
                                                                     dataset_names)
    file_paths = DataReader.locate_files(file_paths_org, configpath)

    # Report every input that could not be located, using the
    # path exactly as it was specified
    if None in file_paths:
        not_found = [file_paths_org[position]
                     for position, located in enumerate(file_paths)
                     if located is None]
        raise FileNotFoundError('The following files were not found: '
                                '{}'.format(repr(not_found)))

    # Train and test both use the default converter
    converters = {'train': configuration.get_default_converter(),
                  'test': configuration.get_default_converter()}

    logger.info('Reading in all data from files.')
    data_container = DataReader(file_paths, file_names, converters).read()

    logger.info('Preprocessing all features.')
    processor = FeaturePreprocessor()
    (processed_config,
     processed_container) = processor.process_data(configuration, data_container)

    # More descriptive names for some of the frames written to disk
    rename_dict = {'train_excluded': 'train_excluded_responses',
                   'test_excluded': 'test_excluded_responses',
                   'train_length': 'train_response_lengths',
                   'train_flagged': 'train_responses_with_excluded_flags',
                   'test_flagged': 'test_responses_with_excluded_flags'}

    frames_to_write = ['train_features',
                       'test_features',
                       'train_metadata',
                       'test_metadata',
                       'train_other_columns',
                       'test_other_columns',
                       'train_preprocessed_features',
                       'test_preprocessed_features',
                       'train_excluded',
                       'test_excluded',
                       'train_length',
                       'test_human_scores',
                       'train_flagged',
                       'test_flagged']

    logger.info('Saving training and test set data to disk.')
    writer.write_experiment_output(csvdir,
                                   processed_container,
                                   frames_to_write,
                                   rename_dict,
                                   file_format=file_format)

    # Data composition analyses
    analyzer = Analyzer()
    (_,
     analyzed_container) = analyzer.run_data_composition_analyses_for_rsmtool(
         processed_container, processed_config)
    writer.write_experiment_output(csvdir,
                                   analyzed_container,
                                   file_format=file_format)

    logger.info('Training {} model.'.format(processed_config['model_name']))

    modeler = Modeler()
    modeler.train(processed_config,
                  processed_container,
                  csvdir,
                  figdir,
                  file_format)

    # Record the features the model ended up using
    selected_features = modeler.get_feature_names()
    processed_config['selected_features'] = selected_features

    writer.write_feature_csv(featuredir,
                             processed_container,
                             selected_features,
                             file_format=file_format)

    # Work on a copy so that the processed container itself is
    # not given the extra 'selected_feature_info' data set
    features_data_container = processed_container.copy()
    feature_info = features_data_container.feature_info.copy()
    is_selected = feature_info['feature'].isin(selected_features)
    features_data_container.add_dataset({'name': 'selected_feature_info',
                                         'frame': feature_info[is_selected]},
                                        update=True)

    writer.write_experiment_output(csvdir,
                                   features_data_container,
                                   dataframe_names=['selected_feature_info'],
                                   new_names_dict={'selected_feature_info': 'feature'},
                                   file_format=file_format)

    logger.info('Running analyses on training set.')
    (_,
     train_analyzed_container) = analyzer.run_training_analyses(processed_container,
                                                                processed_config)
    writer.write_experiment_output(csvdir,
                                   train_analyzed_container,
                                   reset_index=True,
                                   file_format=file_format)

    # Predictions only need the ID, the score, and the selected features
    columns_for_prediction = ['spkitemid', 'sc1'] + selected_features
    train_for_prediction = processed_container.train_preprocessed_features[
        columns_for_prediction]
    test_for_prediction = processed_container.test_preprocessed_features[
        columns_for_prediction]

    if configuration['predict_expected_scores']:
        message_ending = ' (expected scores).'
    else:
        message_ending = '.'
    logger.info('Generating training and test set predictions' + message_ending)

    (pred_config,
     pred_data_container) = modeler.predict_train_and_test(train_for_prediction,
                                                           test_for_prediction,
                                                           processed_config)
    writer.write_experiment_output(csvdir,
                                   pred_data_container,
                                   new_names_dict={'pred_test': 'pred_processed'},
                                   file_format=file_format)

    # Scaled coefficients are only produced when a coefficients
    # file exists in the output directory
    coefficients_name = '{}_coefficients.{}'.format(pred_config['experiment_id'],
                                                    file_format)
    original_coef_file = join(csvdir, coefficients_name)
    if exists(original_coef_file):
        logger.info('Scaling the coefficients and saving them to disk')
        try:
            scaled_data_container = modeler.scale_coefficients(pred_config)
            writer.write_experiment_output(csvdir,
                                           scaled_data_container,
                                           file_format=file_format)
        except AttributeError:
            raise ValueError("It appears you are trying to save two different "
                             "experiments to the same directory using the same "
                             "ID. Please clear the content of the directory and "
                             "rerun both experiments using different "
                             "experiment IDs.")

    # The prediction analyses need the processed frames as well
    new_pred_data_container = pred_data_container + processed_container

    logger.info('Running prediction analyses.')
    (_,
     pred_analysis_data_container) = analyzer.run_prediction_analyses(
         new_pred_data_container, pred_config)
    writer.write_experiment_output(csvdir,
                                   pred_analysis_data_container,
                                   reset_index=True,
                                   file_format=file_format)

    reporter = Reporter()
    logger.info('Starting report generation.')
    reporter.create_report(processed_config, csvdir, figdir)
def run_comparison(config_file_or_obj, output_dir):
    """
    Run an ``rsmcompare`` experiment from the given configuration
    and generate the comparison report in the given directory.

    Parameters
    ----------
    config_file_or_obj : str or Configuration
        Path to the experiment configuration file, or a
        ``Configuration`` object that is already in memory.
    output_dir : str
        Path to the experiment output directory.

    Raises
    ------
    FileNotFoundError
        If either experiment directory cannot be located, or lacks
        the ``output`` or ``figure`` sub-directory.
    ValueError
        Presumably raised by the configuration parser if any of the
        required fields are missing or ill-specified.
    """
    logger = logging.getLogger(__name__)

    # The caller may hand over either an in-memory Configuration
    # or the path to a configuration file that still has to be parsed
    if isinstance(config_file_or_obj, Configuration):
        configuration = config_file_or_obj
        if configuration.filepath is None:
            # no file behind the object: resolve paths against the cwd
            configpath = os.getcwd()
        else:
            configpath = dirname(configuration.filepath)
    else:
        parser = ConfigurationParser.get_configparser(config_file_or_obj)
        configuration = parser.read_normalize_validate_and_process_config(
            config_file_or_obj, context='rsmcompare')

        # paths are resolved relative to the configuration file
        configpath = dirname(config_file_or_obj)

    logger.info('Saving configuration file.')
    configuration.save(output_dir)

    def _locate_experiment_output(id_field, dir_field):
        # Validate one of the two experiments being compared and
        # return its (csv directory, figure directory) pair.
        experiment_id = configuration[id_field]
        experiment_dir = DataReader.locate_files(configuration[dir_field],
                                                 configpath)
        if not experiment_dir:
            raise FileNotFoundError("The directory {} "
                                    "does not exist.".format(
                                        configuration[dir_field]))

        csvdir = normpath(join(experiment_dir, 'output'))
        figdir = normpath(join(experiment_dir, 'figure'))
        if not (exists(csvdir) and exists(figdir)):
            raise FileNotFoundError("The directory {} does not contain "
                                    "the output of an rsmtool "
                                    "experiment.".format(experiment_dir))

        check_experiment_id(experiment_dir, experiment_id)
        return csvdir, figdir

    # the "old" experiment is validated first, then the "new" one
    csvdir_old, figdir_old = _locate_experiment_output('experiment_id_old',
                                                       'experiment_dir_old')
    csvdir_new, figdir_new = _locate_experiment_output('experiment_id_new',
                                                       'experiment_dir_new')

    # general, special, and custom report sections to include
    general_report_sections = configuration['general_sections']
    special_report_sections = configuration['special_sections']
    custom_report_section_paths = configuration['custom_sections']

    # custom sections have to be located; without any the list is empty
    custom_report_sections = []
    if custom_report_section_paths:
        logger.info('Locating custom report sections')
        custom_report_sections = Reporter.locate_custom_sections(
            custom_report_section_paths, configpath)

    section_order = configuration['section_order']
    subgroups = configuration.get('subgroups')

    reporter = Reporter()
    chosen_notebook_files = reporter.get_ordered_notebook_files(
        general_report_sections,
        special_report_sections,
        custom_report_sections,
        section_order,
        subgroups,
        model_type=None,
        context='rsmcompare')

    # the report generation reads the chosen notebooks from the configuration
    configuration['chosen_notebook_files'] = chosen_notebook_files

    logger.info('Starting report generation.')
    reporter.create_comparison_report(configuration,
                                      csvdir_old,
                                      figdir_old,
                                      csvdir_new,
                                      figdir_new,
                                      output_dir)
def run_evaluation(config_file_or_obj, output_dir):
    """
    Run an `rsmeval` experiment using the given configuration
    file and generate all outputs in the given directory.

    Parameters
    ----------
    config_file_or_obj : str or configuration_parser.Configuration
        Path to the experiment configuration file.
        Users can also pass a `Configuration` object that is in memory.
    output_dir : str
        Path to the experiment output directory.

    Raises
    ------
    FileNotFoundError
        If the predictions file cannot be located, or if a scaling
        file was specified via ``scale_with`` and cannot be located.
    ValueError
        Presumably raised by the configuration parser if any of the
        required fields are missing or ill-specified.
    """
    logger = logging.getLogger(__name__)

    # create the 'output', 'figure' and 'report' sub-directories
    # where all the experiment output such as the CSV files
    # and the box plots will be saved
    csvdir = abspath(join(output_dir, 'output'))
    figdir = abspath(join(output_dir, 'figure'))
    reportdir = abspath(join(output_dir, 'report'))

    os.makedirs(csvdir, exist_ok=True)
    os.makedirs(figdir, exist_ok=True)
    os.makedirs(reportdir, exist_ok=True)

    # Allow users to pass Configuration object to the
    # `config_file_or_obj` argument, rather than read file
    if not isinstance(config_file_or_obj, Configuration):

        # Instantiate configuration parser object
        parser = ConfigurationParser.get_configparser(config_file_or_obj)
        configuration = parser.read_normalize_validate_and_process_config(
            config_file_or_obj, context='rsmeval')

        # get the directory where the configuration file lives
        configpath = dirname(config_file_or_obj)

    else:

        configuration = config_file_or_obj
        if configuration.filepath is not None:
            configpath = dirname(configuration.filepath)
        else:
            # no file behind the object: resolve paths against the cwd
            configpath = os.getcwd()

    logger.info('Saving configuration file.')
    configuration.save(output_dir)

    # Get output format
    file_format = configuration.get('file_format', 'csv')

    # Get DataWriter object
    writer = DataWriter(configuration['experiment_id'])

    # Make sure prediction file can be located
    if not DataReader.locate_files(configuration['predictions_file'], configpath):
        raise FileNotFoundError('Error: Predictions file {} '
                                'not found.\n'.format(
                                    configuration['predictions_file']))

    scale_with = configuration.get('scale_with')

    # scale_with can be one of the following:
    # (a) None       : the predictions are assumed to be 'raw' and should be used as is
    #                  when computing the metrics; the names for the final columns are
    #                  'raw', 'raw_trim' and 'raw_trim_round'.
    # (b) 'asis'     : the predictions are assumed to be pre-scaled and should be used as is
    #                  when computing the metrics; the names for the final columns are
    #                  'scale', 'scale_trim' and 'scale_trim_round'.
    # (c) a CSV file : the predictions are assumed to be 'raw' and should be scaled
    #                  before computing the metrics; the names for the final columns are
    #                  'scale', 'scale_trim' and 'scale_trim_round'.

    # Check whether we want to do scaling
    do_scaling = (scale_with is not None and scale_with != 'asis')

    # The paths to files and names for data container properties
    paths = ['predictions_file']
    names = ['predictions']

    # If we want to do scaling, get the scale file
    if do_scaling:

        # Make sure scale file can be located
        scale_file_location = DataReader.locate_files(scale_with, configpath)
        if not scale_file_location:
            # report the path as specified by the user; the located
            # path is falsy on this branch and would not identify the file
            raise FileNotFoundError('Could not find scaling file {}.'
                                    ''.format(scale_with))

        paths.append('scale_with')
        names.append('scale')

    # Get the paths, names, and converters for the DataReader
    (file_names, file_paths) = configuration.get_names_and_paths(paths, names)
    file_paths = DataReader.locate_files(file_paths, configpath)

    converters = {'predictions': configuration.get_default_converter()}

    logger.info('Reading predictions: {}.'.format(
        configuration['predictions_file']))

    # Initialize the reader
    reader = DataReader(file_paths, file_names, converters)
    data_container = reader.read()

    logger.info('Preprocessing predictions.')

    # Initialize the processor
    processor = FeaturePreprocessor()

    (processed_config,
     processed_container) = processor.process_data(configuration,
                                                   data_container,
                                                   context='rsmeval')

    logger.info('Saving pre-processed predictions and metadata to disk.')
    writer.write_experiment_output(csvdir,
                                   processed_container,
                                   new_names_dict={
                                       'pred_test': 'pred_processed',
                                       'test_excluded': 'test_excluded_responses'
                                   },
                                   file_format=file_format)

    # Initialize the analyzer
    analyzer = Analyzer()

    # do the data composition stats
    (analyzed_config,
     analyzed_container) = analyzer.run_data_composition_analyses_for_rsmeval(
         processed_container, processed_config)

    # Write out files
    writer.write_experiment_output(csvdir,
                                   analyzed_container,
                                   file_format=file_format)

    # the prediction analyses need the processed frames as well
    for_pred_data_container = analyzed_container + processed_container

    # run the analyses on the predictions of the model
    logger.info('Running analyses on predictions.')
    (pred_analysis_config,
     pred_analysis_data_container) = analyzer.run_prediction_analyses(
         for_pred_data_container, analyzed_config)

    writer.write_experiment_output(csvdir,
                                   pred_analysis_data_container,
                                   reset_index=True,
                                   file_format=file_format)

    # Initialize reporter
    reporter = Reporter()

    # generate the report
    logger.info('Starting report generation.')
    reporter.create_report(processed_config,
                           csvdir,
                           figdir,
                           context='rsmeval')
def run_experiment(config_file_or_obj, output_dir):
    """
    Run an RSMTool experiment from the given configuration and write
    all of its outputs under the given directory.

    Parameters
    ----------
    config_file_or_obj : str or Configuration
        Path to the experiment configuration file, or a
        ``Configuration`` object that is already in memory.
    output_dir : str
        Path to the experiment output directory.

    Raises
    ------
    FileNotFoundError
        If any of the input files named in the configuration
        cannot be located.
    ValueError
        If scaling the coefficients fails with an ``AttributeError``,
        which is reported as two experiments sharing the same ID and
        directory. The configuration parser presumably also raises this
        for missing or ill-specified fields.
    """
    logger = logging.getLogger(__name__)

    # Absolute paths to the sub-directories that will hold the CSV files,
    # the figures, the report, and the feature file
    csvdir = abspath(join(output_dir, 'output'))
    figdir = abspath(join(output_dir, 'figure'))
    reportdir = abspath(join(output_dir, 'report'))
    featuredir = abspath(join(output_dir, 'feature'))

    # Only these three are created here; `featuredir` is just passed on
    for directory in (csvdir, figdir, reportdir):
        makedirs(directory, exist_ok=True)

    # The caller may hand over either an in-memory Configuration
    # or the path to a configuration file that still has to be parsed
    if isinstance(config_file_or_obj, Configuration):
        configuration = config_file_or_obj
        if configuration.filepath is None:
            # no file behind the object: resolve paths against the cwd
            configpath = getcwd()
        else:
            configpath = dirname(configuration.filepath)
    else:
        parser = ConfigurationParser.get_configparser(config_file_or_obj)
        configuration = parser.read_normalize_validate_and_process_config(
            config_file_or_obj)

        # input paths are resolved relative to the configuration file
        configpath = dirname(config_file_or_obj)

    logger.info('Saving configuration file.')
    configuration.save(output_dir)

    file_format = configuration.get('file_format', 'csv')
    writer = DataWriter(configuration['experiment_id'])

    # Configuration fields holding input paths, and the names under
    # which the corresponding data sets are read
    path_fields = ['train_file', 'test_file', 'features', 'feature_subset_file']
    dataset_names = ['train', 'test', 'feature_specs', 'feature_subset_specs']
    (file_names, file_paths_org) = configuration.get_names_and_paths(path_fields,
                                                                     dataset_names)
    file_paths = DataReader.locate_files(file_paths_org, configpath)

    # Report every input that could not be located, using the
    # path exactly as it was specified
    if None in file_paths:
        not_found = [file_paths_org[position]
                     for position, located in enumerate(file_paths)
                     if located is None]
        raise FileNotFoundError('The following files were not found: '
                                '{}'.format(repr(not_found)))

    # Train and test both use the default converter
    converters = {'train': configuration.get_default_converter(),
                  'test': configuration.get_default_converter()}

    logger.info('Reading in all data from files.')
    data_container = DataReader(file_paths, file_names, converters).read()

    logger.info('Preprocessing all features.')
    processor = FeaturePreprocessor()
    (processed_config,
     processed_container) = processor.process_data(configuration, data_container)

    # More descriptive names for some of the frames written to disk
    rename_dict = {'train_excluded': 'train_excluded_responses',
                   'test_excluded': 'test_excluded_responses',
                   'train_length': 'train_response_lengths',
                   'train_flagged': 'train_responses_with_excluded_flags',
                   'test_flagged': 'test_responses_with_excluded_flags'}

    frames_to_write = ['train_features',
                       'test_features',
                       'train_metadata',
                       'test_metadata',
                       'train_other_columns',
                       'test_other_columns',
                       'train_preprocessed_features',
                       'test_preprocessed_features',
                       'train_excluded',
                       'test_excluded',
                       'train_length',
                       'test_human_scores',
                       'train_flagged',
                       'test_flagged']

    logger.info('Saving training and test set data to disk.')
    writer.write_experiment_output(csvdir,
                                   processed_container,
                                   frames_to_write,
                                   rename_dict,
                                   file_format=file_format)

    # Data composition analyses
    analyzer = Analyzer()
    (_,
     analyzed_container) = analyzer.run_data_composition_analyses_for_rsmtool(
         processed_container, processed_config)
    writer.write_experiment_output(csvdir,
                                   analyzed_container,
                                   file_format=file_format)

    logger.info('Training {} model.'.format(processed_config['model_name']))

    modeler = Modeler()
    modeler.train(processed_config,
                  processed_container,
                  csvdir,
                  figdir,
                  file_format)

    # Record the features the model ended up using
    selected_features = modeler.get_feature_names()
    processed_config['selected_features'] = selected_features

    writer.write_feature_csv(featuredir,
                             processed_container,
                             selected_features,
                             file_format=file_format)

    # Work on a copy so that the processed container itself is
    # not given the extra 'selected_feature_info' data set
    features_data_container = processed_container.copy()
    feature_info = features_data_container.feature_info.copy()
    is_selected = feature_info['feature'].isin(selected_features)
    features_data_container.add_dataset({'name': 'selected_feature_info',
                                         'frame': feature_info[is_selected]},
                                        update=True)

    writer.write_experiment_output(csvdir,
                                   features_data_container,
                                   dataframe_names=['selected_feature_info'],
                                   new_names_dict={'selected_feature_info': 'feature'},
                                   file_format=file_format)

    logger.info('Running analyses on training set.')
    (_,
     train_analyzed_container) = analyzer.run_training_analyses(processed_container,
                                                                processed_config)
    writer.write_experiment_output(csvdir,
                                   train_analyzed_container,
                                   reset_index=True,
                                   file_format=file_format)

    # Predictions only need the ID, the score, and the selected features
    columns_for_prediction = ['spkitemid', 'sc1'] + selected_features
    train_for_prediction = processed_container.train_preprocessed_features[
        columns_for_prediction]
    test_for_prediction = processed_container.test_preprocessed_features[
        columns_for_prediction]

    if configuration['predict_expected_scores']:
        message_ending = ' (expected scores).'
    else:
        message_ending = '.'
    logger.info('Generating training and test set predictions' + message_ending)

    (pred_config,
     pred_data_container) = modeler.predict_train_and_test(train_for_prediction,
                                                           test_for_prediction,
                                                           processed_config)
    writer.write_experiment_output(csvdir,
                                   pred_data_container,
                                   new_names_dict={'pred_test': 'pred_processed'},
                                   file_format=file_format)

    # Scaled coefficients are only produced when a coefficients
    # file exists in the output directory
    coefficients_name = '{}_coefficients.{}'.format(pred_config['experiment_id'],
                                                    file_format)
    original_coef_file = join(csvdir, coefficients_name)
    if exists(original_coef_file):
        logger.info('Scaling the coefficients and saving them to disk')
        try:
            scaled_data_container = modeler.scale_coefficients(pred_config)
            writer.write_experiment_output(csvdir,
                                           scaled_data_container,
                                           file_format=file_format)
        except AttributeError:
            raise ValueError(
                "It appears you are trying to save two different "
                "experiments to the same directory using the same "
                "ID. Please clear the content of the directory and "
                "rerun both experiments using different "
                "experiment IDs.")

    # The prediction analyses need the processed frames as well
    new_pred_data_container = pred_data_container + processed_container

    logger.info('Running prediction analyses.')
    (_,
     pred_analysis_data_container) = analyzer.run_prediction_analyses(
         new_pred_data_container, pred_config)
    writer.write_experiment_output(csvdir,
                                   pred_analysis_data_container,
                                   reset_index=True,
                                   file_format=file_format)

    reporter = Reporter()
    logger.info('Starting report generation.')
    reporter.create_report(processed_config, csvdir, figdir)
def compute_and_save_predictions(config_file_or_obj, output_file, feats_file=None):
    """
    Run ``rsmpredict`` with given configuration file and generate
    predictions (and, optionally, pre-processed feature values).

    Parameters
    ----------
    config_file_or_obj : str or configuration_parser.Configuration
        Path to the experiment configuration file.
        Users can also pass a `Configuration` object that is in memory.
    output_file : str
        Where to save the predictions. If the path ends in ``.csv`` or
        ``.xlsx`` (case-insensitive), its directory and extension-less
        base name are used. Otherwise the whole path is treated as an
        output directory and the predictions are named
        ``predictions_with_metadata``.
    feats_file : str, optional
        Path to the output file for saving preprocessed feature values.
        Its directory and extension-less base name are used.
        Defaults to ``None``, in which case no features are saved.

    Raises
    ------
    FileNotFoundError
        If the input features file, the experiment directory, its
        ``output`` sub-directory, the ``.model`` file for the experiment
        ID, or one of the required ``*_feature.csv`` /
        ``*_postprocessing_params.csv`` files cannot be found.
    ValueError
        If any of the required fields are missing or ill-specified.
    """
    # NOTE(review): this function is defined a second time later in the
    # file; the later definition replaces this one at import time.
    # Confirm which copy should be kept.

    logger = logging.getLogger(__name__)

    # Allow users to pass a Configuration object instead of a file path
    if isinstance(config_file_or_obj, Configuration):
        config = config_file_or_obj
        # an in-memory configuration may not know where it came from;
        # fall back to the current working directory in that case
        if config.filepath is not None:
            configpath = dirname(config.filepath)
        else:
            configpath = os.getcwd()
    else:
        parser = ConfigurationParser.get_configparser(config_file_or_obj)
        config = parser.read_normalize_validate_and_process_config(config_file_or_obj,
                                                                   context='rsmpredict')
        # relative paths in the config are resolved against its directory
        configpath = dirname(config_file_or_obj)

    experiment_id = config['experiment_id']
    file_format = config.get('file_format', 'csv')
    writer = DataWriter(experiment_id)

    # locate the file with the feature values we want predictions for
    input_features_file = DataReader.locate_files(config['input_features_file'],
                                                  configpath)
    if not input_features_file:
        raise FileNotFoundError('Input file {} does not exist'
                                ''.format(config['input_features_file']))

    experiment_dir = DataReader.locate_files(config['experiment_dir'], configpath)
    if not experiment_dir:
        raise FileNotFoundError('The directory {} does not exist.'
                                ''.format(config['experiment_dir']))

    experiment_output_dir = normpath(join(experiment_dir, 'output'))
    if not exists(experiment_output_dir):
        raise FileNotFoundError('The directory {} does not contain '
                                'the output of an rsmtool experiment.'.format(experiment_dir))

    # find all the .model files in the experiment output directory
    model_files = glob.glob(join(experiment_output_dir, '*.model'))
    if not model_files:
        raise FileNotFoundError('The directory {} does not contain any rsmtool models.'
                                ''.format(experiment_output_dir))

    # a model file is named after the experiment that produced it
    experiment_ids = [splitext(basename(mf))[0] for mf in model_files]
    if experiment_id not in experiment_ids:
        raise FileNotFoundError('{} does not contain a model for the experiment "{}". '
                                'The following experiments are contained in this '
                                'directory: {}'.format(experiment_output_dir,
                                                       experiment_id,
                                                       experiment_ids))

    # check that the directory contains the other required files
    required_file_types = ['feature', 'postprocessing_params']
    for file_type in required_file_types:
        expected_file_name = "{}_{}.csv".format(experiment_id, file_type)
        if not exists(join(experiment_output_dir, expected_file_name)):
            raise FileNotFoundError('{} does not contain the required file '
                                    '{} that was generated during the '
                                    'original model training'.format(experiment_output_dir,
                                                                     expected_file_name))

    logger.info('Reading input files.')

    feature_info = join(experiment_output_dir,
                        '{}_feature.csv'.format(experiment_id))
    post_processing = join(experiment_output_dir,
                           '{}_postprocessing_params.csv'.format(experiment_id))

    file_paths = [input_features_file, feature_info, post_processing]
    file_names = ['input_features', 'feature_info', 'postprocessing_params']
    converters = {'input_features': config.get_default_converter()}

    reader = DataReader(file_paths, file_names, converters)
    data_container = reader.read(kwargs_dict={'feature_info': {'index_col': 0}})

    # load the trained model and hand it to the processor via the config
    model = Modeler.load_from_file(join(experiment_output_dir,
                                        '{}.model'.format(experiment_id)))
    config['model'] = model

    processor = FeaturePreprocessor()
    (processed_config,
     processed_container) = processor.process_data(config,
                                                   data_container,
                                                   context='rsmpredict')

    # save the pre-processed features to disk if we were asked to
    if feats_file is not None:
        logger.info('Saving pre-processed feature values to {}'.format(feats_file))

        # dirname() is '' for a bare file name and os.makedirs('') raises,
        # so fall back to the current directory
        feats_dir = dirname(feats_file) or os.curdir
        os.makedirs(feats_dir, exist_ok=True)

        feats_filename = splitext(basename(feats_file))[0]

        writer.write_experiment_output(feats_dir,
                                       processed_container,
                                       include_experiment_id=False,
                                       dataframe_names=['features_processed'],
                                       new_names_dict={'features_processed': feats_filename},
                                       file_format=file_format)

    # a recognized extension means `output_file` names a file;
    # anything else is treated as a directory
    if output_file.lower().endswith(('.csv', '.xlsx')):
        # same bare-file-name fallback as for `feats_file` above
        output_dir = dirname(output_file) or os.curdir
        filename = splitext(basename(output_file))[0]
    else:
        output_dir = output_file
        filename = 'predictions_with_metadata'

    # create any directories needed for the output file
    os.makedirs(output_dir, exist_ok=True)

    logger.info('Saving predictions.')
    writer.write_experiment_output(output_dir,
                                   processed_container,
                                   include_experiment_id=False,
                                   dataframe_names=['predictions_with_metadata'],
                                   new_names_dict={'predictions_with_metadata': filename},
                                   file_format=file_format)

    # save excluded responses to disk, if there are any
    if not processed_container.excluded.empty:
        logger.info('Saving excluded responses to {}'.format(join(output_dir,
                                                                  '{}_excluded_responses.csv'
                                                                  ''.format(filename))))
        writer.write_experiment_output(output_dir,
                                       processed_container,
                                       include_experiment_id=False,
                                       dataframe_names=['excluded'],
                                       new_names_dict={'excluded': '{}_excluded_responses'
                                                                   ''.format(filename)},
                                       file_format=file_format)
def compute_and_save_predictions(config_file_or_obj, output_file, feats_file=None):
    """
    Run ``rsmpredict`` with given configuration file and generate
    predictions (and, optionally, pre-processed feature values).

    Parameters
    ----------
    config_file_or_obj : str or configuration_parser.Configuration
        Path to the experiment configuration file.
        Users can also pass a `Configuration` object that is in memory.
    output_file : str
        Where to save the predictions. If the path ends in ``.csv`` or
        ``.xlsx`` (case-insensitive), its directory and extension-less
        base name are used. Otherwise the whole path is treated as an
        output directory and the predictions are named
        ``predictions_with_metadata``.
    feats_file : str, optional
        Path to the output file for saving preprocessed feature values.
        Its directory and extension-less base name are used.
        Defaults to ``None``, in which case no features are saved.

    Raises
    ------
    FileNotFoundError
        If the input features file, the experiment directory, its
        ``output`` sub-directory, the ``.model`` file for the experiment
        ID, or one of the required ``*_feature.csv`` /
        ``*_postprocessing_params.csv`` files cannot be found.
    ValueError
        If any of the required fields are missing or ill-specified.
    """
    # NOTE(review): an earlier definition with this same name exists in
    # this file and is replaced by this one at import time. Confirm that
    # the duplicate is intentional.

    logger = logging.getLogger(__name__)

    # Allow users to pass a Configuration object instead of a file path
    if isinstance(config_file_or_obj, Configuration):
        config = config_file_or_obj
        # an in-memory configuration may not know where it came from;
        # fall back to the current working directory in that case
        if config.filepath is not None:
            configpath = dirname(config.filepath)
        else:
            configpath = os.getcwd()
    else:
        parser = ConfigurationParser.get_configparser(config_file_or_obj)
        config = parser.read_normalize_validate_and_process_config(
            config_file_or_obj, context='rsmpredict')
        # relative paths in the config are resolved against its directory
        configpath = dirname(config_file_or_obj)

    experiment_id = config['experiment_id']
    file_format = config.get('file_format', 'csv')
    writer = DataWriter(experiment_id)

    # locate the file with the feature values we want predictions for
    input_features_file = DataReader.locate_files(
        config['input_features_file'], configpath)
    if not input_features_file:
        raise FileNotFoundError('Input file {} does not exist'
                                ''.format(config['input_features_file']))

    experiment_dir = DataReader.locate_files(config['experiment_dir'],
                                             configpath)
    if not experiment_dir:
        raise FileNotFoundError('The directory {} does not exist.'
                                ''.format(config['experiment_dir']))

    experiment_output_dir = normpath(join(experiment_dir, 'output'))
    if not exists(experiment_output_dir):
        raise FileNotFoundError(
            'The directory {} does not contain '
            'the output of an rsmtool experiment.'.format(experiment_dir))

    # find all the .model files in the experiment output directory
    model_files = glob.glob(join(experiment_output_dir, '*.model'))
    if not model_files:
        raise FileNotFoundError(
            'The directory {} does not contain any rsmtool models.'
            ''.format(experiment_output_dir))

    # a model file is named after the experiment that produced it
    experiment_ids = [splitext(basename(mf))[0] for mf in model_files]
    if experiment_id not in experiment_ids:
        raise FileNotFoundError(
            '{} does not contain a model for the experiment "{}". '
            'The following experiments are contained in this '
            'directory: {}'.format(experiment_output_dir, experiment_id,
                                   experiment_ids))

    # check that the directory contains the other required files
    required_file_types = ['feature', 'postprocessing_params']
    for file_type in required_file_types:
        expected_file_name = "{}_{}.csv".format(experiment_id, file_type)
        if not exists(join(experiment_output_dir, expected_file_name)):
            raise FileNotFoundError('{} does not contain the required file '
                                    '{} that was generated during the '
                                    'original model training'.format(
                                        experiment_output_dir,
                                        expected_file_name))

    logger.info('Reading input files.')

    feature_info = join(experiment_output_dir,
                        '{}_feature.csv'.format(experiment_id))
    post_processing = join(
        experiment_output_dir,
        '{}_postprocessing_params.csv'.format(experiment_id))

    file_paths = [input_features_file, feature_info, post_processing]
    file_names = ['input_features', 'feature_info', 'postprocessing_params']
    converters = {'input_features': config.get_default_converter()}

    reader = DataReader(file_paths, file_names, converters)
    data_container = reader.read(
        kwargs_dict={'feature_info': {'index_col': 0}})

    # load the trained model and hand it to the processor via the config
    model = Modeler.load_from_file(
        join(experiment_output_dir, '{}.model'.format(experiment_id)))
    config['model'] = model

    processor = FeaturePreprocessor()
    (processed_config,
     processed_container) = processor.process_data(config,
                                                   data_container,
                                                   context='rsmpredict')

    # save the pre-processed features to disk if we were asked to
    if feats_file is not None:
        logger.info(
            'Saving pre-processed feature values to {}'.format(feats_file))

        # dirname() is '' for a bare file name and os.makedirs('') raises,
        # so fall back to the current directory
        feats_dir = dirname(feats_file) or os.curdir
        os.makedirs(feats_dir, exist_ok=True)

        feats_filename = splitext(basename(feats_file))[0]

        writer.write_experiment_output(
            feats_dir,
            processed_container,
            include_experiment_id=False,
            dataframe_names=['features_processed'],
            new_names_dict={'features_processed': feats_filename},
            file_format=file_format)

    # a recognized extension means `output_file` names a file;
    # anything else is treated as a directory
    if output_file.lower().endswith(('.csv', '.xlsx')):
        # same bare-file-name fallback as for `feats_file` above
        output_dir = dirname(output_file) or os.curdir
        filename = splitext(basename(output_file))[0]
    else:
        output_dir = output_file
        filename = 'predictions_with_metadata'

    # create any directories needed for the output file
    os.makedirs(output_dir, exist_ok=True)

    logger.info('Saving predictions.')
    writer.write_experiment_output(
        output_dir,
        processed_container,
        include_experiment_id=False,
        dataframe_names=['predictions_with_metadata'],
        new_names_dict={'predictions_with_metadata': filename},
        file_format=file_format)

    # save excluded responses to disk, if there are any
    if not processed_container.excluded.empty:
        logger.info('Saving excluded responses to {}'.format(
            join(output_dir, '{}_excluded_responses.csv'
                 ''.format(filename))))
        writer.write_experiment_output(output_dir,
                                       processed_container,
                                       include_experiment_id=False,
                                       dataframe_names=['excluded'],
                                       new_names_dict={
                                           'excluded':
                                           '{}_excluded_responses'
                                           ''.format(filename)
                                       },
                                       file_format=file_format)