# NOTE: these snippets assume the surrounding SKLL test/experiment context:
# `_my_dir` points at the test directory; `fill_in_config_paths_for_single_file`,
# `fill_in_config_options`, `_parse_config_file`, `_classify_featureset`,
# `_munge_featureset_name`, `_check_job_results`, `_write_summary_file`,
# `MAX_CONCURRENT_PROCESSES`, and `_HAVE_GRIDMAP` are module-level helpers and
# constants, and `Job`/`process_jobs` come from the optional `gridmap` package.
import logging
import os
import sys
from itertools import combinations
from os.path import basename, exists, join, normpath

import numpy as np
from nose.tools import eq_, ok_, raises


@raises(ValueError)
def test_test_file_and_test_directory():
    """
    Test that specifying both test_file and test_directory raises a ValueError
    """
    # Run experiment
    config_path = fill_in_config_paths_for_single_file(
        join(_my_dir, "configs", "test_single_file.template.cfg"),
        join(_my_dir, 'train', 'train_single_file.jsonlines'),
        join(_my_dir, 'test', 'test_single_file.jsonlines'),
        test_directory='foo')
    _parse_config_file(config_path)
def test_config_parsing_relative_input_paths():
    train_dir = '../train'
    train_file = join(train_dir, 'f0.jsonlines')
    test_file = join(train_dir, 'f1.jsonlines')
    output_dir = '../output'
    custom_learner_path_input = join('other', 'majority_class_learner.py')

    # make a simple config file that has relative paths
    values_to_fill_dict = {'experiment_name': 'config_parsing',
                           'task': 'evaluate',
                           'train_file': train_file,
                           'test_file': test_file,
                           'learners': "['LogisticRegression']",
                           'log': output_dir,
                           'results': output_dir,
                           'objective': 'f1_score_micro'}

    config_template_path = join(_my_dir, 'configs',
                                'test_relative_paths.template.cfg')
    config_path = fill_in_config_options(config_template_path,
                                         values_to_fill_dict,
                                         'relative_paths')

    (experiment_name, task, sampler, fixed_sampler_parameters,
     feature_hasher, hasher_features, id_col, label_col, train_set_name,
     test_set_name, suffix, featuresets, do_shuffle, model_path,
     do_grid_search, grid_objective, probability, results_path,
     pos_label_str, feature_scaling, min_feature_count, grid_search_jobs,
     grid_search_folds, cv_folds, save_cv_folds, do_stratified_folds,
     fixed_parameter_list, param_grid_list, featureset_names, learners,
     prediction_dir, log_path, train_path, test_path, ids_to_floats,
     class_map, custom_learner_path) = _parse_config_file(config_path)
def test_default_learning_curve_options():
    train_dir = join(_my_dir, 'train')
    output_dir = join(_my_dir, 'output')

    values_to_fill_dict = {'experiment_name': 'config_parsing',
                           'task': 'learning_curve',
                           'train_directory': train_dir,
                           'featuresets': "[['f1', 'f2', 'f3']]",
                           'learners': "['LogisticRegression', 'MultinomialNB']",
                           'log': output_dir,
                           'results': output_dir,
                           'objective': 'f1_score_macro'}

    config_template_path = join(_my_dir, 'configs',
                                'test_config_parsing.template.cfg')
    config_path = fill_in_config_options(config_template_path,
                                         values_to_fill_dict,
                                         'default_learning_curve')

    (experiment_name, task, sampler, fixed_sampler_parameters,
     feature_hasher, hasher_features, id_col, label_col, train_set_name,
     test_set_name, suffix, featuresets, do_shuffle, model_path,
     do_grid_search, grid_objective, probability, results_path,
     pos_label_str, feature_scaling, min_feature_count, grid_search_jobs,
     grid_search_folds, cv_folds, save_cv_folds, do_stratified_folds,
     fixed_parameter_list, param_grid_list, featureset_names, learners,
     prediction_dir, log_path, train_path, test_path, ids_to_floats,
     class_map, custom_learner_path, learning_curve_cv_folds_list,
     learning_curve_train_sizes) = _parse_config_file(config_path)

    eq_(learning_curve_cv_folds_list, [10, 10])
    ok_(np.all(learning_curve_train_sizes == np.linspace(0.1, 1.0, 5)))
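# For reference, the default train sizes asserted above expand to five
# evenly spaced fractions of the training data:
#
#     >>> np.linspace(0.1, 1.0, 5)
#     array([0.1  , 0.325, 0.55 , 0.775, 1.   ])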
def test_config_parsing_relative_input_path():
    train_dir = join('..', 'train')
    output_dir = join(_my_dir, 'output')

    # make a simple config file that uses a relative path for the
    # train directory
    values_to_fill_dict = {'experiment_name': 'config_parsing',
                           'task': 'cross_validate',
                           'train_directory': train_dir,
                           'featuresets': "[['f1', 'f2', 'f3']]",
                           'learners': "['LogisticRegression']",
                           'log': output_dir,
                           'results': output_dir,
                           'objective': 'f1_score_macro'}

    config_template_path = join(_my_dir, 'configs',
                                'test_config_parsing.template.cfg')
    config_path = fill_in_config_options(config_template_path,
                                         values_to_fill_dict,
                                         'mislocated_input_file')

    (experiment_name, task, sampler, fixed_sampler_parameters,
     feature_hasher, hasher_features, id_col, label_col, train_set_name,
     test_set_name, suffix, featuresets, do_shuffle, model_path,
     do_grid_search, grid_objective, probability, results_path,
     pos_label_str, feature_scaling, min_feature_count, grid_search_jobs,
     grid_search_folds, cv_folds, save_cv_folds, do_stratified_folds,
     fixed_parameter_list, param_grid_list, featureset_names, learners,
     prediction_dir, log_path, train_path, test_path, ids_to_floats,
     class_map, custom_learner_path) = _parse_config_file(config_path)

    eq_(normpath(train_path), join(_my_dir, 'train'))
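# Note on the assertion above: the filled-in config file lives under
# `_my_dir/configs`, so the relative `train_directory` of '../train' should
# resolve to `_my_dir/train`. This assumes (as the eq_ check implies) that
# `_parse_config_file` resolves relative input paths against the directory
# containing the config file.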
def test_setting_number_of_cv_folds():
    train_dir = join(_my_dir, 'train')
    output_dir = join(_my_dir, 'output')

    # make a simple config file that sets num_cv_folds to 5
    values_to_fill_dict = {'experiment_name': 'config_parsing',
                           'task': 'cross_validate',
                           'train_directory': train_dir,
                           'featuresets': "[['f1', 'f2', 'f3']]",
                           'learners': "['LogisticRegression']",
                           'log': output_dir,
                           'results': output_dir,
                           'num_cv_folds': "5",
                           'objective': 'f1_score_macro'}

    config_template_path = join(_my_dir, 'configs',
                                'test_config_parsing.template.cfg')
    config_path = fill_in_config_options(config_template_path,
                                         values_to_fill_dict,
                                         'default_cv_folds')

    (experiment_name, task, sampler, fixed_sampler_parameters,
     feature_hasher, hasher_features, id_col, label_col, train_set_name,
     test_set_name, suffix, featuresets, do_shuffle, model_path,
     do_grid_search, grid_objective, probability, results_path,
     pos_label_str, feature_scaling, min_feature_count, grid_search_jobs,
     grid_search_folds, cv_folds, save_cv_folds, do_stratified_folds,
     fixed_parameter_list, param_grid_list, featureset_names, learners,
     prediction_dir, log_path, train_path, test_path, ids_to_floats,
     class_map, custom_learner_path) = _parse_config_file(config_path)

    eq_(cv_folds, 5)
def test_setting_fixed_parameters():
    train_dir = join(_my_dir, 'train')
    test_dir = join(_my_dir, 'test')
    output_dir = join(_my_dir, 'output')

    # make a simple config file that sets fixed_parameters for the learner
    values_to_fill_dict = {'experiment_name': 'config_parsing',
                           'task': 'evaluate',
                           'train_directory': train_dir,
                           'test_directory': test_dir,
                           'featuresets': "[['f1', 'f2', 'f3']]",
                           'learners': "['LinearSVC']",
                           'log': output_dir,
                           'results': output_dir,
                           'fixed_parameters':
                               "[{'C': [1e-6, 0.001, 1, 10, 100, 1e5]}]",
                           'objective': 'f1_score_macro'}

    config_template_path = join(_my_dir, 'configs',
                                'test_config_parsing.template.cfg')
    config_path = fill_in_config_options(config_template_path,
                                         values_to_fill_dict,
                                         'fixed_parameters')

    (experiment_name, task, sampler, fixed_sampler_parameters,
     feature_hasher, hasher_features, id_col, label_col, train_set_name,
     test_set_name, suffix, featuresets, do_shuffle, model_path,
     do_grid_search, grid_objective, probability, results_path,
     pos_label_str, feature_scaling, min_feature_count, grid_search_jobs,
     grid_search_folds, cv_folds, save_cv_folds, do_stratified_folds,
     fixed_parameter_list, param_grid_list, featureset_names, learners,
     prediction_dir, log_path, train_path, test_path, ids_to_floats,
     class_map, custom_learner_path, learning_curve_cv_folds_list,
     learning_curve_train_sizes) = _parse_config_file(config_path)

    eq_(fixed_parameter_list[0]['C'][0], 1e-6)
    eq_(fixed_parameter_list[0]['C'][1], 1e-3)
    eq_(fixed_parameter_list[0]['C'][2], 1)
    eq_(fixed_parameter_list[0]['C'][3], 10)
    eq_(fixed_parameter_list[0]['C'][4], 100)
    eq_(fixed_parameter_list[0]['C'][5], 1e5)
@raises(FileNotFoundError)
def check_config_parsing_file_not_found_error(config_path):
    """
    Assert that calling `_parse_config_file` on `config_path` raises
    FileNotFoundError
    """
    _parse_config_file(config_path)
@raises(KeyError)
def check_config_parsing_key_error(config_path):
    """
    Assert that calling `_parse_config_file` on `config_path` raises KeyError
    """
    _parse_config_file(config_path)
@raises(ValueError)
def check_config_parsing_value_error(config_path):
    """
    Assert that calling `_parse_config_file` on `config_path` raises ValueError
    """
    _parse_config_file(config_path)
def run_configuration(config_file, local=False, overwrite=True, queue='all.q',
                      hosts=None, write_summary=True, quiet=False,
                      ablation=0, resume=False):
    """
    Takes a configuration file and runs the specified jobs on the grid, or
    locally if ``local`` is ``True``.

    :param config_file: Path to the configuration file we would like to use.
    :type config_file: str
    :param local: Should this be run locally instead of on the cluster?
    :type local: bool
    :param overwrite: If the model files already exist, should we overwrite
                      them instead of re-using them?
    :type overwrite: bool
    :param queue: The DRMAA queue to use if we're running on the cluster.
    :type queue: str
    :param hosts: If running on the cluster, these are the machines we should
                  use.
    :type hosts: list of str
    :param write_summary: Write a tsv file with a summary of the results.
    :type write_summary: bool
    :param quiet: Suppress printing of "Loading..." messages.
    :type quiet: bool
    :param ablation: Number of features to remove when doing an ablation
                     experiment. If positive, we will perform repeated ablation
                     runs for all combinations of features removing the
                     specified number at a time. If ``None``, we will use all
                     combinations of all lengths. If 0, the default, no
                     ablation is performed. If negative, a ``ValueError`` is
                     raised.
    :type ablation: int or None
    :param resume: If result files already exist for an experiment, do not
                   overwrite them. This is very useful when doing a large
                   ablation experiment and part of it crashes.
    :type resume: bool

    :return: A list of paths to .json results files for each variation in the
             experiment.
    :rtype: list of str
    """
    # Initialize logger
    logger = logging.getLogger(__name__)

    # Read configuration
    (experiment_name, task, sampler, fixed_sampler_parameters,
     feature_hasher, hasher_features, id_col, label_col, train_set_name,
     test_set_name, suffix, featuresets, do_shuffle, model_path,
     do_grid_search, grid_objectives, probability, results_path,
     pos_label_str, feature_scaling, min_feature_count, grid_search_jobs,
     grid_search_folds, cv_folds, save_cv_folds, do_stratified_folds,
     fixed_parameter_list, param_grid_list, featureset_names, learners,
     prediction_dir, log_path, train_path, test_path, ids_to_floats,
     class_map, custom_learner_path) = _parse_config_file(config_file)

    # Check if we have gridmap
    if not local and not _HAVE_GRIDMAP:
        local = True
        logger.warning('gridmap 0.10.1+ not available. Forcing local mode. '
                       'To run things on a DRMAA-compatible cluster, install '
                       'gridmap>=0.10.1 via pip.')

    # if performing ablation, expand featuresets to include combinations of
    # features within those sets
    if ablation is None or ablation > 0:
        # Make new feature set lists so that we can iterate without issue
        expanded_fs = []
        expanded_fs_names = []
        for features, featureset_name in zip(featuresets, featureset_names):
            features = sorted(features)
            featureset = set(features)
            # Expand to all feature combinations if ablation is None
            if ablation is None:
                for i in range(1, len(features)):
                    for excluded_features in combinations(features, i):
                        expanded_fs.append(sorted(featureset -
                                                  set(excluded_features)))
                        expanded_fs_names.append(
                            featureset_name + '_minus_' +
                            _munge_featureset_name(excluded_features))
            # Otherwise, just expand removing the specified number at a time
            else:
                for excluded_features in combinations(features, ablation):
                    expanded_fs.append(sorted(featureset -
                                              set(excluded_features)))
                    expanded_fs_names.append(
                        featureset_name + '_minus_' +
                        _munge_featureset_name(excluded_features))
            # Also add version with nothing removed as baseline
            expanded_fs.append(features)
            expanded_fs_names.append(featureset_name + '_all')

        # Replace original feature set lists
        featuresets = expanded_fs
        featureset_names = expanded_fs_names
    elif ablation < 0:
        raise ValueError('Value for "ablation" argument must be either '
                         'positive integer or None.')

    # the list of jobs submitted (if running on grid)
    if not local:
        jobs = []

    # the list to hold the paths to all the result json files
    result_json_paths = []

    # check if the length of the featureset_name exceeds the maximum length
    # allowed
    for featureset_name in featureset_names:
        if len(featureset_name) > 210:
            raise OSError('System generated file length "{}" exceeds the '
                          'maximum length supported. Please specify names of '
                          'your datasets with "featureset_names". If you are '
                          'running ablation experiment, please reduce the '
                          'length of the features in "featuresets" because the'
                          ' auto-generated name would be longer than the file '
                          'system can handle'.format(featureset_name))

    # Run each featureset-learner-objective combination
    for featureset, featureset_name in zip(featuresets, featureset_names):
        for learner_num, learner_name in enumerate(learners):
            for grid_objective in grid_objectives:

                # for the individual job name, we need to add the feature set
                # name and the learner name; if there are multiple objectives,
                # the objective name is included as well to keep names unique
                if len(grid_objectives) == 1:
                    job_name_components = [experiment_name, featureset_name,
                                           learner_name]
                else:
                    job_name_components = [experiment_name, featureset_name,
                                           learner_name, grid_objective]

                job_name = '_'.join(job_name_components)

                # change the prediction prefix to include the feature set
                prediction_prefix = join(prediction_dir, job_name)

                # the log file that stores the actual output of this script
                # (e.g., the tuned parameters, what kind of experiment was
                # run, etc.)
                temp_logfile = join(log_path, '{}.log'.format(job_name))

                # Figure out result json file path
                result_json_path = join(results_path,
                                        '{}.results.json'.format(job_name))

                # save the path to the results json file that will be written
                result_json_paths.append(result_json_path)

                # If result file already exists and we're resuming, move on
                if resume and (exists(result_json_path) and
                               os.path.getsize(result_json_path)):
                    logger.info('Running in resume mode and %s exists, '
                                'so skipping job.', result_json_path)
                    continue

                # create job if we're doing things on the grid
                job_args = {}
                job_args["experiment_name"] = experiment_name
                job_args["task"] = task
                job_args["sampler"] = sampler
                job_args["feature_hasher"] = feature_hasher
                job_args["hasher_features"] = hasher_features
                job_args["job_name"] = job_name
                job_args["featureset"] = featureset
                job_args["featureset_name"] = featureset_name
                job_args["learner_name"] = learner_name
                job_args["train_path"] = train_path
                job_args["test_path"] = test_path
                job_args["train_set_name"] = train_set_name
                job_args["test_set_name"] = test_set_name
                job_args["shuffle"] = do_shuffle
                job_args["model_path"] = model_path
                job_args["prediction_prefix"] = prediction_prefix
                job_args["grid_search"] = do_grid_search
                job_args["grid_objective"] = grid_objective
                job_args["suffix"] = suffix
                job_args["log_path"] = temp_logfile
                job_args["probability"] = probability
                job_args["results_path"] = results_path
                job_args["sampler_parameters"] = (fixed_sampler_parameters
                                                  if fixed_sampler_parameters
                                                  else dict())
                job_args["fixed_parameters"] = \
                    (fixed_parameter_list[learner_num]
                     if fixed_parameter_list else dict())
                job_args["param_grid"] = (param_grid_list[learner_num]
                                          if param_grid_list else None)
                job_args["pos_label_str"] = pos_label_str
                job_args["overwrite"] = overwrite
                job_args["feature_scaling"] = feature_scaling
                job_args["min_feature_count"] = min_feature_count
                job_args["grid_search_jobs"] = grid_search_jobs
                job_args["grid_search_folds"] = grid_search_folds
                job_args["cv_folds"] = cv_folds
                job_args["save_cv_folds"] = save_cv_folds
                job_args["do_stratified_folds"] = do_stratified_folds
                job_args["label_col"] = label_col
                job_args["id_col"] = id_col
                job_args["ids_to_floats"] = ids_to_floats
                job_args["quiet"] = quiet
                job_args["class_map"] = class_map
                job_args["custom_learner_path"] = custom_learner_path

                if not local:
                    jobs.append(Job(_classify_featureset,
                                    [job_args],
                                    num_slots=(MAX_CONCURRENT_PROCESSES
                                               if do_grid_search else 1),
                                    name=job_name,
                                    queue=queue))
                else:
                    _classify_featureset(job_args)

    test_set_name = basename(test_path)

    # submit the jobs (if running on grid)
    if not local and _HAVE_GRIDMAP:
        if log_path:
            job_results = process_jobs(jobs, white_list=hosts,
                                       temp_dir=log_path)
        else:
            job_results = process_jobs(jobs, white_list=hosts)
        _check_job_results(job_results)

    # write out the summary results file
    if (task == 'cross_validate' or task == 'evaluate') and write_summary:
        summary_file_name = experiment_name + '_summary.tsv'
        file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
        with open(join(results_path, summary_file_name),
                  file_mode) as output_file:
            _write_summary_file(result_json_paths, output_file,
                                ablation=ablation)

    return result_json_paths
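# A minimal usage sketch for run_configuration. 'evaluate.cfg' is a
# hypothetical config file name used only for illustration; local=True
# bypasses the gridmap/DRMAA machinery and runs every featureset-learner
# job in-process, which is the simplest way to run on a single machine.
if __name__ == '__main__':
    result_json_paths = run_configuration('evaluate.cfg', local=True,
                                          quiet=True)
    for json_path in result_json_paths:
        print(json_path)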
@raises(TypeError)
def check_config_parsing_type_error(config_path):
    """
    Assert that calling `_parse_config_file` on `config_path` raises TypeError
    """
    _parse_config_file(config_path)
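# A sketch of how the decorated check_* helpers above are typically driven,
# assuming nose-style test generators (consistent with the eq_/ok_/raises
# usage elsewhere in this suite). The config file names below are
# hypothetical and used only for illustration.
def test_config_parsing_errors():
    for checker, config_name in [
            (check_config_parsing_value_error, 'bad_value.cfg'),
            (check_config_parsing_key_error, 'bad_key.cfg'),
            (check_config_parsing_type_error, 'bad_type.cfg')]:
        yield checker, join(_my_dir, 'configs', config_name)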
@raises(IOError)
def test_empty_config_name_raises_file_not_found_error():
    """
    Assert that calling `_parse_config_file` on an empty string raises IOError
    (in Python 3, FileNotFoundError is a subclass of IOError/OSError)
    """
    _parse_config_file("")