@raises(ValueError)
def test_test_file_and_test_directory():
    """
    Test that specifying both test_file and test_directory raises ValueError.
    """
    # Run experiment
    config_path = fill_in_config_paths_for_single_file(
        join(_my_dir, "configs", "test_single_file"
             ".template.cfg"),
        join(_my_dir, 'train', 'train_single_file'
             '.jsonlines'),
        join(_my_dir, 'test', 'test_single_file.'
             'jsonlines'),
        test_directory='foo')
    _parse_config_file(config_path)
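
The same expectation can also be written inline with nose's assert_raises instead of the decorator; a minimal sketch, reusing the config_path built above:

from nose.tools import assert_raises

# calling the parser with both test_file and test_directory set
# should fail fast with ValueError
assert_raises(ValueError, _parse_config_file, config_path)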
Example #2
def test_config_parsing_relative_input_paths():

    train_dir = '../train'
    train_file = join(train_dir, 'f0.jsonlines')
    test_file = join(train_dir, 'f1.jsonlines')
    output_dir = '../output'
    custom_learner_path_input = join('other', 'majority_class_learner.py')

    # make a simple config file that has relative paths
    values_to_fill_dict = {
        'experiment_name': 'config_parsing',
        'task': 'evaluate',
        'train_file': train_file,
        'test_file': test_file,
        'learners': "['LogisticRegression']",
        'log': output_dir,
        'results': output_dir,
        'objective': 'f1_score_micro'
    }

    config_template_path = join(_my_dir, 'configs',
                                'test_relative_paths.template.cfg')
    config_path = fill_in_config_options(config_template_path,
                                         values_to_fill_dict, 'relative_paths')

    (experiment_name, task, sampler, fixed_sampler_parameters, feature_hasher,
     hasher_features, id_col, label_col, train_set_name, test_set_name, suffix,
     featuresets, do_shuffle, model_path, do_grid_search, grid_objective,
     probability, results_path, pos_label_str, feature_scaling,
     min_feature_count, grid_search_jobs, grid_search_folds, cv_folds,
     save_cv_folds, do_stratified_folds, fixed_parameter_list, param_grid_list,
     featureset_names, learners, prediction_dir, log_path, train_path,
     test_path, ids_to_floats, class_map,
     custom_learner_path) = _parse_config_file(config_path)
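
A side note on the 37-value unpacking above: it is purely positional and easy to misalign across SKLL versions. One readability sketch (not part of SKLL's API) pairs the parsed values with field names that simply mirror the unpacking order shown above:

# FIELDS mirrors the tuple order returned by _parse_config_file above
FIELDS = ['experiment_name', 'task', 'sampler', 'fixed_sampler_parameters',
          'feature_hasher', 'hasher_features', 'id_col', 'label_col',
          'train_set_name', 'test_set_name', 'suffix', 'featuresets',
          'do_shuffle', 'model_path', 'do_grid_search', 'grid_objective',
          'probability', 'results_path', 'pos_label_str', 'feature_scaling',
          'min_feature_count', 'grid_search_jobs', 'grid_search_folds',
          'cv_folds', 'save_cv_folds', 'do_stratified_folds',
          'fixed_parameter_list', 'param_grid_list', 'featureset_names',
          'learners', 'prediction_dir', 'log_path', 'train_path',
          'test_path', 'ids_to_floats', 'class_map', 'custom_learner_path']

# look values up by name instead of position
parsed = dict(zip(FIELDS, _parse_config_file(config_path)))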
Example #3
def test_default_learning_curve_options():

    train_dir = join(_my_dir, 'train')
    output_dir = join(_my_dir, 'output')

    values_to_fill_dict = {
        'experiment_name': 'config_parsing',
        'task': 'learning_curve',
        'train_directory': train_dir,
        'featuresets': "[['f1', 'f2', 'f3']]",
        'learners': "['LogisticRegression', 'MultinomialNB']",
        'log': output_dir,
        'results': output_dir,
        'objective': 'f1_score_macro'
    }

    config_template_path = join(_my_dir, 'configs',
                                'test_config_parsing.template.cfg')
    config_path = fill_in_config_options(config_template_path,
                                         values_to_fill_dict,
                                         'default_learning_curve')

    (experiment_name, task, sampler, fixed_sampler_parameters, feature_hasher,
     hasher_features, id_col, label_col, train_set_name, test_set_name, suffix,
     featuresets, do_shuffle, model_path, do_grid_search, grid_objective,
     probability, results_path, pos_label_str, feature_scaling,
     min_feature_count, grid_search_jobs, grid_search_folds, cv_folds,
     save_cv_folds, do_stratified_folds, fixed_parameter_list, param_grid_list,
     featureset_names, learners, prediction_dir, log_path, train_path,
     test_path, ids_to_floats, class_map, custom_learner_path,
     learning_curve_cv_folds_list,
     learning_curve_train_sizes) = _parse_config_file(config_path)

    eq_(learning_curve_cv_folds_list, [10, 10])
    ok_(np.all(learning_curve_train_sizes == np.linspace(0.1, 1.0, 5)))
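
For reference, the default train sizes asserted above are just NumPy's evenly spaced fractions; a quick sketch:

import numpy as np

# the five default training-set fractions, evenly spaced from 10% to 100%
sizes = np.linspace(0.1, 1.0, 5)
print(sizes)  # [0.1, 0.325, 0.55, 0.775, 1.0]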
Example #4
def test_config_parsing_relative_input_path():

    train_dir = join('..', 'train')
    output_dir = join(_my_dir, 'output')

    # make a simple config file that has a relative train directory
    values_to_fill_dict = {
        'experiment_name': 'config_parsing',
        'task': 'cross_validate',
        'train_directory': train_dir,
        'featuresets': "[['f1', 'f2', 'f3']]",
        'learners': "['LogisticRegression']",
        'log': output_dir,
        'results': output_dir,
        'objective': 'f1_score_macro'
    }

    config_template_path = join(_my_dir, 'configs',
                                'test_config_parsing.template.cfg')

    config_path = fill_in_config_options(config_template_path,
                                         values_to_fill_dict,
                                         'mislocated_input_file')

    (experiment_name, task, sampler, fixed_sampler_parameters, feature_hasher,
     hasher_features, id_col, label_col, train_set_name, test_set_name, suffix,
     featuresets, do_shuffle, model_path, do_grid_search, grid_objective,
     probability, results_path, pos_label_str, feature_scaling,
     min_feature_count, grid_search_jobs, grid_search_folds, cv_folds,
     save_cv_folds, do_stratified_folds, fixed_parameter_list, param_grid_list,
     featureset_names, learners, prediction_dir, log_path, train_path,
     test_path, ids_to_floats, class_map,
     custom_learner_path) = _parse_config_file(config_path)

    eq_(normpath(train_path), join(_my_dir, 'train'))
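
The assertion implies that relative input paths are resolved against the directory containing the generated config file. A hypothetical helper sketches that rule (locate_relative_to_config is illustrative, not a SKLL function):

from os.path import dirname, join, normpath

def locate_relative_to_config(config_path, relative_path):
    # resolve a path from the config file relative to the config's own
    # directory, e.g. configs/foo.cfg + '../train' -> <base>/train
    return normpath(join(dirname(config_path), relative_path))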
Example #5
def test_setting_number_of_cv_folds():

    train_dir = join(_my_dir, 'train')
    output_dir = join(_my_dir, 'output')
    # make a simple config file that sets num_cv_folds to 5

    values_to_fill_dict = {'experiment_name': 'config_parsing',
                           'task': 'cross_validate',
                           'train_directory': train_dir,
                           'featuresets': "[['f1', 'f2', 'f3']]",
                           'learners': "['LogisticRegression']",
                           'log': output_dir,
                           'results': output_dir,
                           'num_cv_folds': "5",
                           'objective': 'f1_score_macro'}

    config_template_path = join(_my_dir, 'configs',
                                'test_config_parsing.template.cfg')
    config_path = fill_in_config_options(config_template_path,
                                         values_to_fill_dict,
                                         'default_cv_folds')

    (experiment_name, task, sampler, fixed_sampler_parameters,
     feature_hasher, hasher_features, id_col, label_col, train_set_name,
     test_set_name, suffix, featuresets, do_shuffle, model_path,
     do_grid_search, grid_objective, probability, results_path,
     pos_label_str, feature_scaling, min_feature_count,
     grid_search_jobs, grid_search_folds, cv_folds, save_cv_folds, do_stratified_folds,
     fixed_parameter_list, param_grid_list, featureset_names, learners,
     prediction_dir, log_path, train_path, test_path, ids_to_floats,
     class_map, custom_learner_path) = _parse_config_file(config_path)

    eq_(cv_folds, 5)
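
Under the hood, num_cv_folds arrives as the string "5" and must be coerced to an integer. A generic configparser sketch of that step (the section name [Input] is an assumption here, not necessarily SKLL's actual layout):

import configparser

cfg = configparser.ConfigParser()
cfg.read_string("[Input]\nnum_cv_folds = 5\n")
# getint coerces the raw string value to an integer
assert cfg.getint("Input", "num_cv_folds") == 5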
Example #6
def test_setting_fixed_parameters():

    train_dir = join(_my_dir, 'train')
    test_dir = join(_my_dir, 'test')
    output_dir = join(_my_dir, 'output')

    # make a simple config file that sets fixed_parameters for the learner

    values_to_fill_dict = {
        'experiment_name': 'config_parsing',
        'task': 'evaluate',
        'train_directory': train_dir,
        'test_directory': test_dir,
        'featuresets': "[['f1', 'f2', 'f3']]",
        'learners': "['LinearSVC']",
        'log': output_dir,
        'results': output_dir,
        'fixed_parameters': "[{'C': [1e-6, 0.001, 1, 10, 100, 1e5]}]",
        'objective': 'f1_score_macro'
    }

    config_template_path = join(_my_dir, 'configs',
                                'test_config_parsing.template.cfg')
    config_path = fill_in_config_options(config_template_path,
                                         values_to_fill_dict,
                                         'fixed_parameters')

    (experiment_name, task, sampler, fixed_sampler_parameters, feature_hasher,
     hasher_features, id_col, label_col, train_set_name, test_set_name, suffix,
     featuresets, do_shuffle, model_path, do_grid_search, grid_objective,
     probability, results_path, pos_label_str, feature_scaling,
     min_feature_count, grid_search_jobs, grid_search_folds, cv_folds,
     save_cv_folds, do_stratified_folds, fixed_parameter_list, param_grid_list,
     featureset_names, learners, prediction_dir, log_path, train_path,
     test_path, ids_to_floats, class_map, custom_learner_path,
     learning_curve_cv_folds_list,
     learning_curve_train_sizes) = _parse_config_file(config_path)

    eq_(fixed_parameter_list[0]['C'][0], 1e-6)
    eq_(fixed_parameter_list[0]['C'][1], 1e-3)
    eq_(fixed_parameter_list[0]['C'][2], 1)
    eq_(fixed_parameter_list[0]['C'][3], 10)
    eq_(fixed_parameter_list[0]['C'][4], 100)
    eq_(fixed_parameter_list[0]['C'][5], 1e5)
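
The fixed_parameters value is written into the config as a Python-literal string. One way to recover the objects, sketched here with ast.literal_eval (SKLL's own parser may use a different mechanism):

from ast import literal_eval

# literal_eval safely evaluates the list/dict/number literals,
# including exponent notation such as 1e-6 and 1e5
fixed = literal_eval("[{'C': [1e-6, 0.001, 1, 10, 100, 1e5]}]")
assert fixed[0]['C'][0] == 1e-6
assert fixed[0]['C'][-1] == 1e5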
Example #7
def test_setting_fixed_parameters():

    train_dir = join(_my_dir, 'train')
    test_dir = join(_my_dir, 'test')
    output_dir = join(_my_dir, 'output')

    # make a simple config file that sets fixed_parameters for the learner

    values_to_fill_dict = {'experiment_name': 'config_parsing',
                           'task': 'evaluate',
                           'train_directory': train_dir,
                           'test_directory': test_dir,
                           'featuresets': "[['f1', 'f2', 'f3']]",
                           'learners': "['LinearSVC']",
                           'log': output_dir,
                           'results': output_dir,
                           'fixed_parameters': "[{'C': [1e-6, 0.001, 1, 10, 100, 1e5]}]",
                           'objective': 'f1_score_macro'}

    config_template_path = join(_my_dir, 'configs',
                                'test_config_parsing.template.cfg')
    config_path = fill_in_config_options(config_template_path,
                                         values_to_fill_dict,
                                         'fixed_parameters')

    (experiment_name, task, sampler, fixed_sampler_parameters,
     feature_hasher, hasher_features, id_col, label_col, train_set_name,
     test_set_name, suffix, featuresets, do_shuffle, model_path,
     do_grid_search, grid_objective, probability, results_path,
     pos_label_str, feature_scaling, min_feature_count,
     grid_search_jobs, grid_search_folds, cv_folds, save_cv_folds, do_stratified_folds,
     fixed_parameter_list, param_grid_list, featureset_names, learners,
     prediction_dir, log_path, train_path, test_path, ids_to_floats,
     class_map, custom_learner_path) = _parse_config_file(config_path)

    eq_(fixed_parameter_list[0]['C'][0], 1e-6)
    eq_(fixed_parameter_list[0]['C'][1], 1e-3)
    eq_(fixed_parameter_list[0]['C'][2], 1)
    eq_(fixed_parameter_list[0]['C'][3], 10)
    eq_(fixed_parameter_list[0]['C'][4], 100)
    eq_(fixed_parameter_list[0]['C'][5], 1e5)
Example #8
@raises(FileNotFoundError)
def check_config_parsing_file_not_found_error(config_path):
    """
    Assert that calling `_parse_config_file` on `config_path` raises FileNotFoundError
    """
    _parse_config_file(config_path)
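
Helpers like this one are typically driven by nose-style test generators; a sketch of that pattern (the bad config paths listed here are hypothetical):

def test_config_parsing_file_not_found():
    # each yielded (function, argument) pair is run by nose as a
    # separate test case
    for config_path in ['no_such_dir/missing.cfg', 'missing.cfg']:
        yield check_config_parsing_file_not_found_error, config_path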
Example #9
@raises(KeyError)
def check_config_parsing_key_error(config_path):
    """
    Assert that calling `_parse_config_file` on `config_path` raises KeyError
    """
    _parse_config_file(config_path)
Example #10
def run_configuration(config_file, local=False, overwrite=True, queue='all.q',
                      hosts=None, write_summary=True, quiet=False,
                      ablation=0, resume=False):
    """
    Takes a configuration file and runs the specified jobs on the grid.

    :param config_file: Path to the configuration file we would like to use.
    :type config_file: str
    :param local: Should this be run locally instead of on the cluster?
    :type local: bool
    :param overwrite: If the model files already exist, should we overwrite
                      them instead of re-using them?
    :type overwrite: bool
    :param queue: The DRMAA queue to use if we're running on the cluster.
    :type queue: str
    :param hosts: If running on the cluster, these are the machines we should
                  use.
    :type hosts: list of str
    :param write_summary: Write a tsv file with a summary of the results.
    :type write_summary: bool
    :param quiet: Suppress printing of "Loading..." messages.
    :type quiet: bool
    :param ablation: Number of features to remove when doing an ablation
                     experiment. If positive, we will perform repeated ablation
                     runs for all combinations of features removing the
                     specified number at a time. If ``None``, we will use all
                     combinations of all lengths. If 0, the default, no
                     ablation is performed. If negative, a ``ValueError`` is
                     raised.
    :type ablation: int or None
    :param resume: If result files already exist for an experiment, do not
                   overwrite them. This is very useful when doing a large
                   ablation experiment and part of it crashes.
    :type resume: bool

    :return: A list of paths to .json results files for each variation in the
             experiment.
    :rtype: list of str

    """
    # Initialize logger
    logger = logging.getLogger(__name__)

    # Read configuration
    (experiment_name, task, sampler, fixed_sampler_parameters, feature_hasher,
     hasher_features, id_col, label_col, train_set_name, test_set_name, suffix,
     featuresets, do_shuffle, model_path, do_grid_search, grid_objective,
     probability, results_path, pos_label_str, feature_scaling,
     min_feature_count, grid_search_jobs, grid_search_folds, cv_folds, do_stratified_folds,
     fixed_parameter_list, param_grid_list, featureset_names, learners,
     prediction_dir, log_path, train_path, test_path, ids_to_floats, class_map,
     custom_learner_path) = _parse_config_file(config_file)

    # Check if we have gridmap
    if not local and not _HAVE_GRIDMAP:
        local = True
        logger.warning('gridmap 0.10.1+ not available. Forcing local '
                       'mode.  To run things on a DRMAA-compatible '
                       'cluster, install gridmap>=0.10.1 via pip.')

    # if performing ablation, expand featuresets to include combinations of
    # features within those sets
    if ablation is None or ablation > 0:
        # Make new feature set lists so that we can iterate without issue
        expanded_fs = []
        expanded_fs_names = []
        for features, featureset_name in zip(featuresets, featureset_names):
            features = sorted(features)
            featureset = set(features)
            # Expand to all feature combinations if ablation is None
            if ablation is None:
                for i in range(1, len(features)):
                    for excluded_features in combinations(features, i):
                        expanded_fs.append(sorted(featureset -
                                                  set(excluded_features)))
                        expanded_fs_names.append(
                            featureset_name +
                            '_minus_' +
                            _munge_featureset_name(excluded_features))
            # Otherwise, just expand removing the specified number at a time
            else:
                for excluded_features in combinations(features, ablation):
                    expanded_fs.append(sorted(featureset -
                                              set(excluded_features)))
                    expanded_fs_names.append(
                        featureset_name +
                        '_minus_' +
                        _munge_featureset_name(excluded_features))
            # Also add version with nothing removed as baseline
            expanded_fs.append(features)
            expanded_fs_names.append(featureset_name + '_all')

        # Replace original feature set lists
        featuresets = expanded_fs
        featureset_names = expanded_fs_names
    elif ablation < 0:
        raise ValueError('Value for "ablation" argument must be either a '
                         'positive integer or None.')

    # the list of jobs submitted (if running on grid)
    if not local:
        jobs = []

    # the list to hold the paths to all the result json files
    result_json_paths = []

    # check if the length of the featureset_name exceeds the maximum length
    # allowed
    for featureset_name in featureset_names:
        if len(featureset_name) > 210:
            raise OSError('System-generated filename "{}" exceeds the '
                          'maximum length the file system supports. Please '
                          'specify shorter names for your datasets with '
                          '"featureset_names". If you are running an '
                          'ablation experiment, please shorten the feature '
                          'names in "featuresets", because the '
                          'auto-generated name would be longer than the '
                          'file system can handle.'.format(featureset_name))

    # Run each featureset-learner combination
    for featureset, featureset_name in zip(featuresets, featureset_names):
        for learner_num, learner_name in enumerate(learners):

            # for the individual job name, we need to add the feature set name
            # and the learner name
            job_name_components = [experiment_name, featureset_name,
                                   learner_name]
            job_name = '_'.join(job_name_components)

            # change the prediction prefix to include the feature set
            prediction_prefix = join(prediction_dir, job_name)

            # the log file that stores the actual output of this script (e.g.,
            # the tuned parameters, what kind of experiment was run, etc.)
            temp_logfile = join(log_path, '{}.log'.format(job_name))

            # Figure out result json file path
            result_json_path = join(results_path,
                                    '{}.results.json'.format(job_name))

            # save the path to the results json file that will be written
            result_json_paths.append(result_json_path)

            # If result file already exists and we're resuming, move on
            if resume and (exists(result_json_path) and
                           os.path.getsize(result_json_path)):
                logger.info('Running in resume mode and %s exists, so skipping'
                            ' job.', result_json_path)
                continue

            # create job if we're doing things on the grid
            job_args = {}
            job_args["experiment_name"] = experiment_name
            job_args["task"] = task
            job_args["sampler"] = sampler
            job_args["feature_hasher"] = feature_hasher
            job_args["hasher_features"] = hasher_features
            job_args["job_name"] = job_name
            job_args["featureset"] = featureset
            job_args["featureset_name"] = featureset_name
            job_args["learner_name"] = learner_name
            job_args["train_path"] = train_path
            job_args["test_path"] = test_path
            job_args["train_set_name"] = train_set_name
            job_args["test_set_name"] = test_set_name
            job_args["shuffle"] = do_shuffle
            job_args["model_path"] = model_path
            job_args["prediction_prefix"] = prediction_prefix
            job_args["grid_search"] = do_grid_search
            job_args["grid_objective"] = grid_objective
            job_args["suffix"] = suffix
            job_args["log_path"] = temp_logfile
            job_args["probability"] = probability
            job_args["results_path"] = results_path
            job_args["sampler_parameters"] = (fixed_sampler_parameters
                                              if fixed_sampler_parameters
                                              else dict())
            job_args["fixed_parameters"] = (fixed_parameter_list[learner_num]
                                            if fixed_parameter_list
                                            else dict())
            job_args["param_grid"] = (param_grid_list[learner_num]
                                      if param_grid_list else None)
            job_args["pos_label_str"] = pos_label_str
            job_args["overwrite"] = overwrite
            job_args["feature_scaling"] = feature_scaling
            job_args["min_feature_count"] = min_feature_count
            job_args["grid_search_jobs"] = grid_search_jobs
            job_args["grid_search_folds"] = grid_search_folds
            job_args["cv_folds"] = cv_folds
            job_args["do_stratified_folds"] = do_stratified_folds
            job_args["label_col"] = label_col
            job_args["id_col"] = id_col
            job_args["ids_to_floats"] = ids_to_floats
            job_args["quiet"] = quiet
            job_args["class_map"] = class_map
            job_args["custom_learner_path"] = custom_learner_path

            if not local:
                jobs.append(Job(_classify_featureset, [job_args],
                                num_slots=(MAX_CONCURRENT_PROCESSES if
                                           do_grid_search else 1),
                                name=job_name, queue=queue))
            else:
                _classify_featureset(job_args)

    # submit the jobs (if running on grid)
    if not local and _HAVE_GRIDMAP:
        if log_path:
            job_results = process_jobs(jobs, white_list=hosts,
                                       temp_dir=log_path)
        else:
            job_results = process_jobs(jobs, white_list=hosts)
        _check_job_results(job_results)

    # write out the summary results file
    if (task == 'cross_validate' or task == 'evaluate') and write_summary:
        summary_file_name = experiment_name + '_summary.tsv'
        file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
        with open(join(results_path, summary_file_name),
                  file_mode) as output_file:
            _write_summary_file(result_json_paths, output_file,
                                ablation=ablation)

    return result_json_paths
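
The ablation expansion near the top of run_configuration can be read in isolation. A self-contained sketch of the same rule (expand_for_ablation is illustrative, not a SKLL function): drop `ablation` features at a time, then keep the full set as a baseline.

from itertools import combinations

def expand_for_ablation(features, ablation=1):
    featureset = set(features)
    # one reduced set per combination of excluded features
    expanded = [sorted(featureset - set(excluded))
                for excluded in combinations(sorted(features), ablation)]
    expanded.append(sorted(features))  # baseline with nothing removed
    return expanded

# expand_for_ablation(['f1', 'f2', 'f3']) ->
# [['f2', 'f3'], ['f1', 'f3'], ['f1', 'f2'], ['f1', 'f2', 'f3']]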
Example #11
@raises(ValueError)
def check_config_parsing_value_error(config_path):
    """
    Assert that calling `_parse_config_file` on `config_path` raises ValueError
    """
    _parse_config_file(config_path)
Example #12
def run_configuration(config_file, local=False, overwrite=True, queue='all.q',
                      hosts=None, write_summary=True, quiet=False,
                      ablation=0, resume=False):
    """
    Takes a configuration file and runs the specified jobs on the grid.

    :param config_file: Path to the configuration file we would like to use.
    :type config_file: str
    :param local: Should this be run locally instead of on the cluster?
    :type local: bool
    :param overwrite: If the model files already exist, should we overwrite
                      them instead of re-using them?
    :type overwrite: bool
    :param queue: The DRMAA queue to use if we're running on the cluster.
    :type queue: str
    :param hosts: If running on the cluster, these are the machines we should
                  use.
    :type hosts: list of str
    :param write_summary: Write a tsv file with a summary of the results.
    :type write_summary: bool
    :param quiet: Suppress printing of "Loading..." messages.
    :type quiet: bool
    :param ablation: Number of features to remove when doing an ablation
                     experiment. If positive, we will perform repeated ablation
                     runs for all combinations of features removing the
                     specified number at a time. If ``None``, we will use all
                     combinations of all lengths. If 0, the default, no
                     ablation is performed. If negative, a ``ValueError`` is
                     raised.
    :type ablation: int or None
    :param resume: If result files already exist for an experiment, do not
                   overwrite them. This is very useful when doing a large
                   ablation experiment and part of it crashes.
    :type resume: bool

    :return: A list of paths to .json results files for each variation in the
             experiment.
    :rtype: list of str

    """
    # Initialize logger
    logger = logging.getLogger(__name__)

    # Read configuration
    (experiment_name, task, sampler, fixed_sampler_parameters, feature_hasher,
     hasher_features, id_col, label_col, train_set_name, test_set_name, suffix,
     featuresets, do_shuffle, model_path, do_grid_search, grid_objectives,
     probability, results_path, pos_label_str, feature_scaling,
     min_feature_count, grid_search_jobs, grid_search_folds, cv_folds, save_cv_folds,
     do_stratified_folds, fixed_parameter_list, param_grid_list, featureset_names,
     learners, prediction_dir, log_path, train_path, test_path, ids_to_floats,
     class_map, custom_learner_path) = _parse_config_file(config_file)

    # Check if we have gridmap
    if not local and not _HAVE_GRIDMAP:
        local = True
        logger.warning('gridmap 0.10.1+ not available. Forcing local '
                       'mode.  To run things on a DRMAA-compatible '
                       'cluster, install gridmap>=0.10.1 via pip.')

    # if performing ablation, expand featuresets to include combinations of
    # features within those sets
    if ablation is None or ablation > 0:
        # Make new feature set lists so that we can iterate without issue
        expanded_fs = []
        expanded_fs_names = []
        for features, featureset_name in zip(featuresets, featureset_names):
            features = sorted(features)
            featureset = set(features)
            # Expand to all feature combinations if ablation is None
            if ablation is None:
                for i in range(1, len(features)):
                    for excluded_features in combinations(features, i):
                        expanded_fs.append(sorted(featureset -
                                                  set(excluded_features)))
                        expanded_fs_names.append(
                            featureset_name +
                            '_minus_' +
                            _munge_featureset_name(excluded_features))
            # Otherwise, just expand removing the specified number at a time
            else:
                for excluded_features in combinations(features, ablation):
                    expanded_fs.append(sorted(featureset -
                                              set(excluded_features)))
                    expanded_fs_names.append(
                        featureset_name +
                        '_minus_' +
                        _munge_featureset_name(excluded_features))
            # Also add version with nothing removed as baseline
            expanded_fs.append(features)
            expanded_fs_names.append(featureset_name + '_all')

        # Replace original feature set lists
        featuresets = expanded_fs
        featureset_names = expanded_fs_names
    elif ablation < 0:
        raise ValueError('Value for "ablation" argument must be either a '
                         'positive integer or None.')

    # the list of jobs submitted (if running on grid)
    if not local:
        jobs = []

    # the list to hold the paths to all the result json files
    result_json_paths = []

    # check if the length of the featureset_name exceeds the maximum length
    # allowed
    for featureset_name in featureset_names:
        if len(featureset_name) > 210:
            raise OSError('System-generated filename "{}" exceeds the '
                          'maximum length the file system supports. Please '
                          'specify shorter names for your datasets with '
                          '"featureset_names". If you are running an '
                          'ablation experiment, please shorten the feature '
                          'names in "featuresets", because the '
                          'auto-generated name would be longer than the '
                          'file system can handle.'.format(featureset_name))

    # Run each featureset-learner combination
    for featureset, featureset_name in zip(featuresets, featureset_names):
        for learner_num, learner_name in enumerate(learners):
            for grid_objective in grid_objectives:

                # for the individual job name, we need to add the feature set name
                # and the learner name
                if len(grid_objectives) == 1:
                    job_name_components = [experiment_name, featureset_name,
                                           learner_name]
                else:
                    job_name_components = [experiment_name, featureset_name,
                                           learner_name, grid_objective]

                job_name = '_'.join(job_name_components)

                # change the prediction prefix to include the feature set
                prediction_prefix = join(prediction_dir, job_name)

                # the log file that stores the actual output of this script (e.g.,
                # the tuned parameters, what kind of experiment was run, etc.)
                temp_logfile = join(log_path, '{}.log'.format(job_name))

                # Figure out result json file path
                result_json_path = join(results_path,
                                        '{}.results.json'.format(job_name))

                # save the path to the results json file that will be written
                result_json_paths.append(result_json_path)

                # If result file already exists and we're resuming, move on
                if resume and (exists(result_json_path) and
                               os.path.getsize(result_json_path)):
                    logger.info('Running in resume mode and %s exists, '
                                'so skipping job.', result_json_path)
                    continue

                # create job if we're doing things on the grid
                job_args = {}
                job_args["experiment_name"] = experiment_name
                job_args["task"] = task
                job_args["sampler"] = sampler
                job_args["feature_hasher"] = feature_hasher
                job_args["hasher_features"] = hasher_features
                job_args["job_name"] = job_name
                job_args["featureset"] = featureset
                job_args["featureset_name"] = featureset_name
                job_args["learner_name"] = learner_name
                job_args["train_path"] = train_path
                job_args["test_path"] = test_path
                job_args["train_set_name"] = train_set_name
                job_args["test_set_name"] = test_set_name
                job_args["shuffle"] = do_shuffle
                job_args["model_path"] = model_path
                job_args["prediction_prefix"] = prediction_prefix
                job_args["grid_search"] = do_grid_search
                job_args["grid_objective"] = grid_objective
                job_args["suffix"] = suffix
                job_args["log_path"] = temp_logfile
                job_args["probability"] = probability
                job_args["results_path"] = results_path
                job_args["sampler_parameters"] = (fixed_sampler_parameters
                                                  if fixed_sampler_parameters
                                                  else dict())
                job_args["fixed_parameters"] = (fixed_parameter_list[learner_num]
                                                if fixed_parameter_list
                                                else dict())
                job_args["param_grid"] = (param_grid_list[learner_num]
                                          if param_grid_list else None)
                job_args["pos_label_str"] = pos_label_str
                job_args["overwrite"] = overwrite
                job_args["feature_scaling"] = feature_scaling
                job_args["min_feature_count"] = min_feature_count
                job_args["grid_search_jobs"] = grid_search_jobs
                job_args["grid_search_folds"] = grid_search_folds
                job_args["cv_folds"] = cv_folds
                job_args["save_cv_folds"] = save_cv_folds
                job_args["do_stratified_folds"] = do_stratified_folds
                job_args["label_col"] = label_col
                job_args["id_col"] = id_col
                job_args["ids_to_floats"] = ids_to_floats
                job_args["quiet"] = quiet
                job_args["class_map"] = class_map
                job_args["custom_learner_path"] = custom_learner_path

                if not local:
                    jobs.append(Job(_classify_featureset, [job_args],
                                    num_slots=(MAX_CONCURRENT_PROCESSES if
                                               do_grid_search else 1),
                                    name=job_name, queue=queue))
                else:
                    _classify_featureset(job_args)

    # submit the jobs (if running on grid)
    if not local and _HAVE_GRIDMAP:
        if log_path:
            job_results = process_jobs(jobs, white_list=hosts,
                                       temp_dir=log_path)
        else:
            job_results = process_jobs(jobs, white_list=hosts)
        _check_job_results(job_results)

    # write out the summary results file
    if (task == 'cross_validate' or task == 'evaluate') and write_summary:
        summary_file_name = experiment_name + '_summary.tsv'
        file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
        with open(join(results_path, summary_file_name),
                  file_mode) as output_file:
            _write_summary_file(result_json_paths, output_file,
                                ablation=ablation)

    return result_json_paths
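
This variant differs from the previous run_configuration mainly in looping over several tuning objectives. The job-naming rule it applies can be sketched on its own (make_job_name is illustrative, not a SKLL function):

def make_job_name(experiment_name, featureset_name, learner_name,
                  grid_objective, num_objectives):
    # the objective is appended only when more than one is being tuned,
    # so single-objective runs keep the shorter name format
    parts = [experiment_name, featureset_name, learner_name]
    if num_objectives > 1:
        parts.append(grid_objective)
    return '_'.join(parts)

# make_job_name('exp', 'f1+f2+f3', 'LinearSVC', 'f1_score_macro', 2)
# -> 'exp_f1+f2+f3_LinearSVC_f1_score_macro'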
Example #13
@raises(TypeError)
def check_config_parsing_type_error(config_path):
    """
    Assert that calling `_parse_config_file` on `config_path` raises TypeError
    """
    _parse_config_file(config_path)
Example #14
@raises(FileNotFoundError)
def test_empty_config_name_raises_file_not_found_error():
    """
    Assert that calling _parse_config_file on an empty string raises FileNotFoundError
    """
    _parse_config_file("")