Example #1
def _check_job_results(job_results):
    """
    See if we have a complete results dictionary for every job.

    Parameters
    ----------
    job_results : list of lists of dicts
        A list of per-job results, each of which is a list of result dictionaries.
    """
    logger = get_skll_logger('experiment')
    logger.info('Checking job results')
    for result_dicts in job_results:
        if not result_dicts or 'task' not in result_dicts[0]:
            logger.error('There was an error running the experiment:\n%s',
                         result_dicts)
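
A minimal usage sketch (hypothetical data; it assumes the SKLL helpers used above, such as get_skll_logger, are available in the current environment). Each element of job_results is the list of result dictionaries returned by one job; an empty element, or one whose first dictionary lacks a 'task' key, is logged as an error.

# Hypothetical job results: the first job completed, the second one failed.
job_results = [
    [{'task': 'evaluate', 'accuracy': 0.91}],  # complete result for one job
    [],                                        # failed job -> triggers the error log
]
_check_job_results(job_results)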
Example #2
def _write_summary_file(result_json_paths, output_file, ablation=0):
    """
    Take a list of paths to individual result JSON files and
    write out a single TSV file that summarizes all of them.

    Parameters
    ----------
    result_json_paths : list of str
        A list of paths to the individual result JSON files.
    output_file : file object
        The open file handle to write the TSV summary to.
    ablation : int, optional
        The number of features removed at a time when running an ablation
        experiment.
        Defaults to 0.
    """
    learner_result_dicts = []
    # Map from feature set names to all features in them
    all_features = defaultdict(set)
    logger = get_skll_logger('experiment')
    for json_path in result_json_paths:
        if not exists(json_path):
            logger.error(('JSON results file %s not found. Skipping summary '
                          'creation. You can manually create the summary file'
                          ' after the fact by using the summarize_results '
                          'script.'), json_path)
            return
        else:
            with open(json_path, 'r') as json_file:
                obj = json.load(json_file)
                featureset_name = obj[0]['featureset_name']
                if ablation != 0 and '_minus_' in featureset_name:
                    parent_set = featureset_name.split('_minus_', 1)[0]
                    all_features[parent_set].update(
                        yaml.safe_load(obj[0]['featureset']))
                learner_result_dicts.extend(obj)

    # Build and write header
    header = set(learner_result_dicts[0].keys()) - {'result_table',
                                                    'descriptive'}
    if ablation != 0:
        header.add('ablated_features')
    header = sorted(header)
    writer = csv.DictWriter(output_file,
                            header,
                            extrasaction='ignore',
                            dialect=csv.excel_tab)
    writer.writeheader()

    # Build "ablated_features" list and fix some backward compatible things
    for lrd in learner_result_dicts:
        featureset_name = lrd['featureset_name']
        if ablation != 0:
            parent_set = featureset_name.split('_minus_', 1)[0]
            ablated_features = all_features[parent_set].difference(
                yaml.safe_load(lrd['featureset']))
            lrd['ablated_features'] = ''
            if ablated_features:
                lrd['ablated_features'] = json.dumps(sorted(ablated_features))

        # write out the new learner dict with the readable fields
        writer.writerow(lrd)

    output_file.flush()
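
A minimal usage sketch (hypothetical paths). Note that output_file is an open file handle rather than a path, matching how run_configuration calls this function later on this page.

# Hypothetical result files produced by an 'evaluate' or 'cross_validate' run.
result_json_paths = ['output/myexp_fs1_LogisticRegression.results.json']
with open('output/myexp_summary.tsv', 'w', newline='') as output_file:
    _write_summary_file(result_json_paths, output_file, ablation=0)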
Example #3
def _write_learning_curve_file(result_json_paths, output_file):
    """
    Take a list of paths to individual learning curve result
    JSON files and write out a single TSV file with the
    learning curve data.

    Parameters
    ----------
    result_json_paths : list of str
        A list of paths to the individual result JSON files.
    output_file : file object
        The open file handle to write the TSV output to.
    """

    learner_result_dicts = []

    # Map from feature set names to all features in them
    logger = get_skll_logger('experiment')
    for json_path in result_json_paths:
        if not exists(json_path):
            logger.error(('JSON results file %s not found. Skipping summary '
                          'creation. You can manually create the summary file'
                          ' after the fact by using the summarize_results '
                          'script.'), json_path)
            return
        else:
            with open(json_path, 'r') as json_file:
                obj = json.load(json_file)
                learner_result_dicts.extend(obj)

    # Build and write header
    header = ['featureset_name', 'learner_name', 'metric',
              'train_set_name', 'training_set_size', 'train_score_mean',
              'test_score_mean', 'train_score_std', 'test_score_std',
              'scikit_learn_version', 'version']
    writer = csv.DictWriter(output_file,
                            header,
                            extrasaction='ignore',
                            dialect=csv.excel_tab)
    writer.writeheader()

    # write out the fields we need for the learning curve file
    # specifically, we need to separate out the curve sizes
    # and scores into individual entries.
    for lrd in learner_result_dicts:
        training_set_sizes = lrd['computed_curve_train_sizes']
        train_scores_means_by_size = lrd['learning_curve_train_scores_means']
        test_scores_means_by_size = lrd['learning_curve_test_scores_means']
        train_scores_stds_by_size = lrd['learning_curve_train_scores_stds']
        test_scores_stds_by_size = lrd['learning_curve_test_scores_stds']

        # rename `grid_objective` to `metric` since the former name can be confusing
        lrd['metric'] = lrd['grid_objective']

        for (size,
             train_score_mean,
             test_score_mean,
             train_score_std,
             test_score_std) in zip(training_set_sizes,
                                    train_scores_means_by_size,
                                    test_scores_means_by_size,
                                    train_scores_stds_by_size,
                                    test_scores_stds_by_size):
            lrd['training_set_size'] = size
            lrd['train_score_mean'] = train_score_mean
            lrd['test_score_mean'] = test_score_mean
            lrd['train_score_std'] = train_score_std
            lrd['test_score_std'] = test_score_std

            writer.writerow(lrd)

    output_file.flush()
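
A minimal follow-up sketch: reading the resulting learning-curve TSV back for inspection or plotting. This assumes pandas is installed; the column names match the header written above, and the file path is hypothetical.

import pandas as pd

# One row per featureset/learner/metric/training-set-size combination.
curves = pd.read_csv('output/myexp_summary.tsv', sep='\t')
print(curves[['training_set_size', 'train_score_mean', 'test_score_mean']].head())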
Example #4
def _classify_featureset(args):
    """
    Classification job to be submitted to the grid.

    Parameters
    ----------
    args : dict
        A dictionary with arguments for classifying the
        ``FeatureSet`` instance.

    Returns
    -------
    res : list of dicts
        The results of the classification, in the format
        of a list of dictionaries.

    Raises
    ------
    ValueError
        If extra unknown arguments are passed to the function.
    """

    # Extract all the arguments.
    # (There doesn't seem to be a better way to do this since one can't specify
    # required keyword arguments.)

    experiment_name = args.pop("experiment_name")
    task = args.pop("task")
    sampler = args.pop("sampler")
    feature_hasher = args.pop("feature_hasher")
    hasher_features = args.pop("hasher_features")
    job_name = args.pop("job_name")
    featureset = args.pop("featureset")
    featureset_name = args.pop("featureset_name")
    learner_name = args.pop("learner_name")
    train_path = args.pop("train_path")
    test_path = args.pop("test_path")
    train_set_name = args.pop("train_set_name")
    test_set_name = args.pop("test_set_name")
    shuffle = args.pop('shuffle')
    model_path = args.pop("model_path")
    prediction_prefix = args.pop("prediction_prefix")
    grid_search = args.pop("grid_search")
    grid_objective = args.pop("grid_objective")
    output_metrics = args.pop("output_metrics")
    suffix = args.pop("suffix")
    job_log_file = args.pop("log_file")
    job_log_level = args.pop("log_level")
    probability = args.pop("probability")
    pipeline = args.pop("pipeline")
    results_path = args.pop("results_path")
    fixed_parameters = args.pop("fixed_parameters")
    sampler_parameters = args.pop("sampler_parameters")
    param_grid = args.pop("param_grid")
    pos_label_str = args.pop("pos_label_str")
    overwrite = args.pop("overwrite")
    feature_scaling = args.pop("feature_scaling")
    min_feature_count = args.pop("min_feature_count")
    folds_file = args.pop("folds_file")
    grid_search_jobs = args.pop("grid_search_jobs")
    grid_search_folds = args.pop("grid_search_folds")
    cv_folds = args.pop("cv_folds")
    save_cv_folds = args.pop("save_cv_folds")
    save_cv_models = args.pop("save_cv_models")
    use_folds_file_for_grid_search = args.pop("use_folds_file_for_grid_search")
    stratified_folds = args.pop("do_stratified_folds")
    label_col = args.pop("label_col")
    id_col = args.pop("id_col")
    ids_to_floats = args.pop("ids_to_floats")
    class_map = args.pop("class_map")
    custom_learner_path = args.pop("custom_learner_path")
    custom_metric_path = args.pop("custom_metric_path")
    quiet = args.pop('quiet', False)
    learning_curve_cv_folds = args.pop("learning_curve_cv_folds")
    learning_curve_train_sizes = args.pop("learning_curve_train_sizes")

    if args:
        raise ValueError(("Extra arguments passed to _classify_featureset: "
                          "{}").format(args.keys()))
    start_timestamp = datetime.datetime.now()

    # create a new SKLL logger for this specific job and
    # use the given log level
    logger = get_skll_logger(job_name, job_log_file, log_level=job_log_level)

    try:

        # log messages
        logger.info("Task: {}".format(task))

        # check if we have any possible custom metrics
        possible_custom_metric_names = []
        for metric_name in output_metrics + [grid_objective]:
            # metrics that are not in `SCORERS` or `None` are candidates
            # (the `None` is a by-product of how jobs with single tuning
            # objectives are created)
            if metric_name not in SCORERS and metric_name is not None:
                possible_custom_metric_names.append(metric_name)
            # if the metric is already in `SCORERS`, is it a custom one
            # that we already registered? if so, log that
            elif metric_name in _CUSTOM_METRICS:
                logger.info(
                    f"custom metric '{metric_name}' is already registered")

        # initialize list that will hold any invalid metrics
        # that we could not register as custom metrics
        invalid_metric_names = []

        # if we have possible custom metrics
        if possible_custom_metric_names:

            # check that we have a file to load them from
            if not custom_metric_path:
                raise ValueError(
                    f"invalid metrics specified: {possible_custom_metric_names}"
                )
            else:
                # try to register each possible custom metric
                # raise an exception if we fail, if we don't then
                # add the custom metric function to `globals()` so
                # that it serializes properly for gridmap
                for custom_metric_name in possible_custom_metric_names:
                    try:
                        custom_metric_func = register_custom_metric(
                            custom_metric_path, custom_metric_name)
                    except (AttributeError, NameError, ValueError):
                        invalid_metric_names.append(custom_metric_name)
                    else:
                        logger.info(f"registered '{custom_metric_name}' as a "
                                    f"custom metric")
                        globals()[custom_metric_name] = custom_metric_func

        # raise an error if we have any invalid metrics
        if invalid_metric_names:
            raise ValueError(
                f"invalid metrics specified: {invalid_metric_names}. "
                f"If these are custom metrics, check the function "
                f"names.")

        if task == 'cross_validate':
            if isinstance(cv_folds, int):
                num_folds = cv_folds
            else:  # folds_file was used, so count the unique fold ids.
                num_folds = len(set(cv_folds.values()))
            logger.info("Cross-validating ({} folds) on {}, feature "
                        "set {} ...".format(num_folds, train_set_name,
                                            featureset))
        elif task == 'evaluate':
            logger.info("Training on {}, Test on {}, "
                        "feature set {} ...".format(train_set_name,
                                                    test_set_name, featureset))
        elif task == 'train':
            logger.info("Training on {}, feature set {} ...".format(
                train_set_name, featureset))
        elif task == 'learning_curve':
            logger.info("Generating learning curve "
                        "({} 80/20 folds, sizes={}, objective={}) on {}, "
                        "feature set {} ...".format(
                            learning_curve_cv_folds,
                            learning_curve_train_sizes, grid_objective,
                            train_set_name, featureset))
        else:  # predict
            logger.info("Training on {}, Making predictions on {}, "
                        "feature set {} ...".format(train_set_name,
                                                    test_set_name, featureset))

        # check whether a trained model on the same data with the same
        # featureset already exists; if so, load it and then use it on the test data
        modelfile = join(model_path, '{}.model'.format(job_name))
        if (task in ['cross_validate', 'learning_curve']
                or not exists(modelfile) or overwrite):
            train_examples = load_featureset(train_path,
                                             featureset,
                                             suffix,
                                             label_col=label_col,
                                             id_col=id_col,
                                             ids_to_floats=ids_to_floats,
                                             quiet=quiet,
                                             class_map=class_map,
                                             feature_hasher=feature_hasher,
                                             num_features=hasher_features,
                                             logger=logger)

            train_set_size = len(train_examples.ids)
            if not train_examples.has_labels:
                raise ValueError('Training examples do not have labels')
            # initialize a classifier object
            learner = Learner(learner_name,
                              probability=probability,
                              pipeline=pipeline,
                              feature_scaling=feature_scaling,
                              model_kwargs=fixed_parameters,
                              pos_label_str=pos_label_str,
                              min_feature_count=min_feature_count,
                              sampler=sampler,
                              sampler_kwargs=sampler_parameters,
                              custom_learner_path=custom_learner_path,
                              logger=logger)

        # load the model if it already exists
        else:
            # import custom learner into global namespace if we are reusing
            # a saved model
            if custom_learner_path:
                globals()[learner_name] = load_custom_learner(
                    custom_learner_path, learner_name)
            train_set_size = 'unknown'
            if exists(modelfile) and not overwrite:
                logger.info("Loading pre-existing {} model: {}".format(
                    learner_name, modelfile))
            learner = Learner.from_file(modelfile)

            # attach the job logger to this learner
            learner.logger = logger

        # Load test set if there is one
        if task == 'evaluate' or task == 'predict':
            test_examples = load_featureset(test_path,
                                            featureset,
                                            suffix,
                                            label_col=label_col,
                                            id_col=id_col,
                                            ids_to_floats=ids_to_floats,
                                            quiet=quiet,
                                            class_map=class_map,
                                            feature_hasher=feature_hasher,
                                            num_features=hasher_features)
            test_set_size = len(test_examples.ids)
        else:
            test_set_size = 'n/a'

        # compute information about cross-validation and grid search folds that
        # can be put into the results in readable form
        if isinstance(cv_folds, dict):
            cv_folds_to_print = '{} via folds file'.format(
                len(set(cv_folds.values())))
        else:
            cv_folds_to_print = str(cv_folds)

        if isinstance(grid_search_folds, dict):
            grid_search_folds_to_print = \
                '{} via folds file'.format(len(set(grid_search_folds.values())))
        else:
            grid_search_folds_to_print = str(grid_search_folds)

        # create a base dictionary of the results information
        learner_result_dict_base = {
            'experiment_name': experiment_name,
            'train_set_name': train_set_name,
            'train_set_size': train_set_size,
            'test_set_name': test_set_name,
            'test_set_size': test_set_size,
            'featureset': json.dumps(featureset),
            'featureset_name': featureset_name,
            'shuffle': shuffle,
            'learner_name': learner_name,
            'task': task,
            'start_timestamp': start_timestamp.strftime('%d %b %Y %H:%M:%S.%f'),
            'version': __version__,
            'feature_scaling': feature_scaling,
            'folds_file': folds_file,
            'grid_search': grid_search,
            'grid_objective': grid_objective,
            'grid_search_folds': grid_search_folds_to_print,
            'min_feature_count': min_feature_count,
            'cv_folds': cv_folds_to_print,
            'using_folds_file': (isinstance(cv_folds, dict) or
                                 isinstance(grid_search_folds, dict)),
            'save_cv_folds': save_cv_folds,
            'save_cv_models': save_cv_models,
            'use_folds_file_for_grid_search': use_folds_file_for_grid_search,
            'stratified_folds': stratified_folds,
            'scikit_learn_version': SCIKIT_VERSION
        }

        # check if we're doing cross-validation, because we only load/save
        # models when we're not.
        task_results = None
        if task == 'cross_validate':
            logger.info('Cross-validating')
            (
                task_results, grid_scores, grid_search_cv_results_dicts,
                skll_fold_ids, models
            ) = learner.cross_validate(
                train_examples,
                shuffle=shuffle,
                stratified=stratified_folds,
                prediction_prefix=prediction_prefix,
                grid_search=grid_search,
                grid_search_folds=grid_search_folds,
                cv_folds=cv_folds,
                grid_objective=grid_objective,
                output_metrics=output_metrics,
                param_grid=param_grid,
                grid_jobs=grid_search_jobs,
                save_cv_folds=save_cv_folds,
                save_cv_models=save_cv_models,
                use_custom_folds_for_grid_search=use_folds_file_for_grid_search
            )
            if models:
                for index, m in enumerate(models, start=1):
                    modelfile = join(model_path,
                                     '{}_fold{}.model'.format(job_name, index))
                    m.save(modelfile)
        elif task == 'learning_curve':
            logger.info("Generating learning curve(s)")
            (curve_train_scores, curve_test_scores,
             computed_curve_train_sizes) = learner.learning_curve(
                 train_examples,
                 grid_objective,
                 cv_folds=learning_curve_cv_folds,
                 train_sizes=learning_curve_train_sizes)
        else:
            # if we do not have a saved model, we need to train one.
            if not exists(modelfile) or overwrite:
                logger.info("Featurizing and training new {} model".format(
                    learner_name))

                (best_score, grid_search_cv_results) = learner.train(
                    train_examples,
                    shuffle=shuffle,
                    grid_search=grid_search,
                    grid_search_folds=grid_search_folds,
                    grid_objective=grid_objective,
                    param_grid=param_grid,
                    grid_jobs=grid_search_jobs)
                grid_scores = [best_score]
                grid_search_cv_results_dicts = [grid_search_cv_results]

                # save model
                if model_path:
                    learner.save(modelfile)

                if grid_search:
                    logger.info("Best {} grid search score: {}".format(
                        grid_objective, round(best_score, 3)))
            else:
                grid_scores = [None]
                grid_search_cv_results_dicts = [None]

            # print out the parameters
            param_out = ('{}: {}'.format(param_name, param_value)
                         for param_name, param_value in
                         learner.model.get_params().items())
            logger.info("Hyperparameters: {}".format(', '.join(param_out)))

            # run on test set or cross-validate on training data,
            # depending on what was asked for
            if task == 'evaluate':
                logger.info("Evaluating predictions")
                task_results = [
                    learner.evaluate(test_examples,
                                     prediction_prefix=prediction_prefix,
                                     grid_objective=grid_objective,
                                     output_metrics=output_metrics)
                ]
            elif task == 'predict':
                logger.info("Writing predictions")
                # we set `class_labels` to `False` so that if the learner is
                # probabilistic, probabilities are written instead of labels
                learner.predict(test_examples,
                                prediction_prefix=prediction_prefix,
                                class_labels=False)
            # do nothing here for train

        end_timestamp = datetime.datetime.now()
        learner_result_dict_base['end_timestamp'] = end_timestamp.strftime(
            '%d %b %Y %H:%M:%S.%f')
        total_time = end_timestamp - start_timestamp
        learner_result_dict_base['total_time'] = str(total_time)

        if task == 'cross_validate' or task == 'evaluate':
            results_json_path = join(results_path,
                                     '{}.results.json'.format(job_name))

            res = _create_learner_result_dicts(task_results, grid_scores,
                                               grid_search_cv_results_dicts,
                                               learner_result_dict_base)

            # write out the result dictionary to a json file
            with open(results_json_path, 'w') as json_file:
                json.dump(res, json_file, cls=NumpyTypeEncoder)

            with open(join(results_path, '{}.results'.format(job_name)),
                      'w') as output_file:
                _print_fancy_output(res, output_file)

        elif task == 'learning_curve':
            results_json_path = join(results_path,
                                     '{}.results.json'.format(job_name))

            res = {}
            res.update(learner_result_dict_base)
            res.update({
                'learning_curve_cv_folds': learning_curve_cv_folds,
                'given_curve_train_sizes': learning_curve_train_sizes,
                'learning_curve_train_scores_means': np.mean(curve_train_scores, axis=1),
                'learning_curve_test_scores_means': np.mean(curve_test_scores, axis=1),
                'learning_curve_train_scores_stds': np.std(curve_train_scores, axis=1, ddof=1),
                'learning_curve_test_scores_stds': np.std(curve_test_scores, axis=1, ddof=1),
                'computed_curve_train_sizes': computed_curve_train_sizes
            })

            # we need to return and write out a list of dictionaries
            res = [res]

            # write out the result dictionary to a json file
            with open(results_json_path, 'w') as json_file:
                json.dump(res, json_file, cls=NumpyTypeEncoder)

        # For all other tasks, i.e. train or predict
        else:
            if results_path:
                results_json_path = join(results_path,
                                         '{}.results.json'.format(job_name))

                assert len(grid_scores) == 1
                assert len(grid_search_cv_results_dicts) == 1
                grid_search_cv_results_dict = {"grid_score": grid_scores[0]}
                grid_search_cv_results_dict["grid_search_cv_results"] = \
                    grid_search_cv_results_dicts[0]
                grid_search_cv_results_dict.update(learner_result_dict_base)
                # write out the result dictionary to a json file
                with open(results_json_path, 'w') as json_file:
                    json.dump(grid_search_cv_results_dict,
                              json_file,
                              cls=NumpyTypeEncoder)
            res = [learner_result_dict_base]

        # write out the cv folds if required
        if task == 'cross_validate' and save_cv_folds:
            skll_fold_ids_file = experiment_name + '_skll_fold_ids.csv'
            with open(join(results_path, skll_fold_ids_file),
                      'w') as output_file:
                _write_skll_folds(skll_fold_ids, output_file)

    finally:
        close_and_remove_logger_handlers(logger)

    return res
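
A minimal, self-contained sketch of the argument-handling pattern used above: because jobs submitted via gridmap cannot enforce required keyword arguments, every expected key is popped from a plain dict and anything left over is treated as an error. The helper name below is hypothetical and only illustrates the idea.

def _pop_required(args, keys):
    """Pop every expected key from ``args`` and fail on any leftovers."""
    values = {key: args.pop(key) for key in keys}
    if args:
        raise ValueError("Extra arguments passed: {}".format(list(args.keys())))
    return values


# Hypothetical example: only the expected keys are allowed.
print(_pop_required({"task": "train", "shuffle": True}, ["task", "shuffle"]))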
Example #5
def run_configuration(config_file,
                      local=False,
                      overwrite=True,
                      queue='all.q',
                      hosts=None,
                      write_summary=True,
                      quiet=False,
                      ablation=0,
                      resume=False,
                      log_level=logging.INFO):
    """
    Takes a configuration file and runs the specified jobs on the grid.

    Parameters
    ----------
    config_file : str
        Path to the configuration file we would like to use.
    local : bool, optional
        Should this be run locally instead of on the cluster?
        Defaults to ``False``.
    overwrite : bool, optional
        If the model files already exist, should we overwrite
        them instead of re-using them?
        Defaults to ``True``.
    queue : str, optional
        The DRMAA queue to use if we're running on the cluster.
        Defaults to ``'all.q'``.
    hosts : list of str, optional
        If running on the cluster, these are the machines we should use.
        Defaults to ``None``.
    write_summary : bool, optional
        Write a TSV file with a summary of the results.
        Defaults to ``True``.
    quiet : bool, optional
        Suppress printing of "Loading..." messages.
        Defaults to ``False``.
    ablation : int, optional
        Number of features to remove when doing an ablation
        experiment. If positive, we will perform repeated ablation
        runs for all combinations of features removing the
        specified number at a time. If ``None``, we will use all
        combinations of all lengths. If 0, the default, no
        ablation is performed. If negative, a ``ValueError`` is
        raised.
        Defaults to 0.
    resume : bool, optional
        If result files already exist for an experiment, do not
        overwrite them. This is very useful when doing a large
        ablation experiment and part of it crashes.
        Defaults to ``False``.
    log_level : str, optional
        The level for logging messages.
        Defaults to ``logging.INFO``.

    Returns
    -------
    result_json_paths : list of str
        A list of paths to .json results files for each variation in the
        experiment.

    Raises
    ------
    ValueError
        If value for ``"ablation"`` is not a positive int or ``None``.
    OSError
        If the length of the ``FeatureSet`` name exceeds 210 characters.
    """

    try:

        # Read configuration
        (experiment_name, task, sampler, fixed_sampler_parameters,
         feature_hasher, hasher_features, id_col, label_col, train_set_name,
         test_set_name, suffix, featuresets, do_shuffle, model_path,
         do_grid_search, grid_objectives, probability, pipeline, results_path,
         pos_label_str, feature_scaling, min_feature_count, folds_file,
         grid_search_jobs, grid_search_folds, cv_folds, save_cv_folds,
         save_cv_models, use_folds_file_for_grid_search, do_stratified_folds,
         fixed_parameter_list, param_grid_list, featureset_names, learners,
         prediction_dir, log_path, train_path, test_path, ids_to_floats,
         class_map, custom_learner_path, custom_metric_path,
         learning_curve_cv_folds_list, learning_curve_train_sizes,
         output_metrics) = parse_config_file(config_file, log_level=log_level)

        # get the main experiment logger that will already have been
        # created by the configuration parser so we don't need anything
        # except the name `experiment`.
        logger = get_skll_logger('experiment')

        # Check if we have gridmap
        if not local and not _HAVE_GRIDMAP:
            local = True
            logger.warning('gridmap 0.10.1+ not available. Forcing local '
                           'mode.  To run things on a DRMAA-compatible '
                           'cluster, install gridmap>=0.10.1 via pip.')

        # No grid search or ablation for learning curve generation
        if task == 'learning_curve':
            if ablation is None or ablation > 0:
                ablation = 0
                logger.warning("Ablating features is not supported during "
                               "learning curve generation. Ignoring.")

        # if we just have a train file and a test file, there are no real
        # featuresets, in which case there are no features to ablate
        if len(featuresets) == 1 and len(featuresets[0]) == 1:
            if ablation is None or ablation > 0:
                ablation = 0
                logger.warning(
                    "Not enough featuresets for ablation. Ignoring.")

        # if performing ablation, expand featuresets to include combinations of
        # features within those sets
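        # For example (hypothetical sets): with ablation=1 and featureset
        # ["A", "B", "C"] named "fs", this yields ["B", "C"], ["A", "C"] and
        # ["A", "B"] (named via _munge_featureset_name, roughly "fs_minus_A",
        # "fs_minus_B", "fs_minus_C"), plus the baseline ["A", "B", "C"]
        # named "fs_all".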
        if ablation is None or ablation > 0:
            # Make new feature set lists so that we can iterate without issue
            expanded_fs = []
            expanded_fs_names = []
            for features, featureset_name in zip(featuresets,
                                                 featureset_names):
                features = sorted(features)
                featureset = set(features)
                # Expand to all feature combinations if ablation is None
                if ablation is None:
                    for i in range(1, len(features)):
                        for excluded_features in combinations(features, i):
                            expanded_fs.append(
                                sorted(featureset - set(excluded_features)))
                            expanded_fs_names.append(
                                featureset_name + '_minus_' +
                                _munge_featureset_name(excluded_features))
                # Otherwise, just expand removing the specified number at a time
                else:
                    for excluded_features in combinations(features, ablation):
                        expanded_fs.append(
                            sorted(featureset - set(excluded_features)))
                        expanded_fs_names.append(
                            featureset_name + '_minus_' +
                            _munge_featureset_name(excluded_features))
                # Also add version with nothing removed as baseline
                expanded_fs.append(features)
                expanded_fs_names.append(featureset_name + '_all')

            # Replace original feature set lists
            featuresets = expanded_fs
            featureset_names = expanded_fs_names
        elif ablation < 0:
            raise ValueError('Value for "ablation" argument must be either '
                             'positive integer or None.')

        # the list of jobs submitted (if running on grid)
        if not local:
            jobs = []

        # the list to hold the paths to all the result json files
        result_json_paths = []

        # check if the length of the featureset_name exceeds the maximum length
        # allowed
        for featureset_name in featureset_names:
            if len(featureset_name) > 210:
                raise OSError(
                    'System-generated featureset name "{}" exceeds the '
                    'maximum length supported. Please specify names for '
                    'your datasets with "featureset_names". If you are '
                    'running an ablation experiment, please reduce the '
                    'length of the feature names in "featuresets" because '
                    'the auto-generated name would be longer than the file '
                    'system can handle.'.format(featureset_name))

        # if the task is learning curve, and ``metrics`` was specified, then
        # assign the value of ``metrics`` to ``grid_objectives`` - this lets
        # us piggyback on the parallelization of the objectives that is already
        # set up for us to use
        if task == 'learning_curve' and len(output_metrics) > 0:
            grid_objectives = output_metrics

        # if there were no grid objectives provided, just set it to
        # a list containing a single None so as to allow the parallelization
        # to proceed and to pass the correct default value of grid_objective
        # down to _classify_featureset().
        if not grid_objectives:
            grid_objectives = [None]

        # Run each featureset-learner-objective combination
        for featureset, featureset_name in zip(featuresets, featureset_names):
            for learner_num, learner_name in enumerate(learners):
                for grid_objective in grid_objectives:

                    # for the individual job name, we need to add the feature set name
                    # and the learner name
                    if grid_objective is None or len(grid_objectives) == 1:
                        job_name_components = [
                            experiment_name, featureset_name, learner_name
                        ]
                    else:
                        job_name_components = [
                            experiment_name, featureset_name, learner_name,
                            grid_objective
                        ]

                    job_name = '_'.join(job_name_components)

                    # change the prediction prefix to include the feature set
                    prediction_prefix = join(prediction_dir, job_name)

                    # the log file that stores the actual output of this script (e.g.,
                    # the tuned parameters, what kind of experiment was run, etc.)
                    logfile = join(log_path, '{}.log'.format(job_name))

                    # Figure out result json file path
                    result_json_path = join(results_path,
                                            '{}.results.json'.format(job_name))

                    # save the path to the results json file that will be written
                    result_json_paths.append(result_json_path)

                    # If result file already exists and we're resuming, move on
                    if resume and (exists(result_json_path)
                                   and getsize(result_json_path)):
                        logger.info(
                            'Running in resume mode and %s exists, '
                            'so skipping job.', result_json_path)
                        continue

                    # create job if we're doing things on the grid
                    job_args = {}
                    job_args["experiment_name"] = experiment_name
                    job_args["task"] = task
                    job_args["sampler"] = sampler
                    job_args["feature_hasher"] = feature_hasher
                    job_args["hasher_features"] = hasher_features
                    job_args["job_name"] = job_name
                    job_args["featureset"] = featureset
                    job_args["featureset_name"] = featureset_name
                    job_args["learner_name"] = learner_name
                    job_args["train_path"] = train_path
                    job_args["test_path"] = test_path
                    job_args["train_set_name"] = train_set_name
                    job_args["test_set_name"] = test_set_name
                    job_args["shuffle"] = do_shuffle
                    job_args["model_path"] = model_path
                    job_args["prediction_prefix"] = prediction_prefix
                    job_args["grid_search"] = do_grid_search
                    job_args["grid_objective"] = grid_objective
                    job_args['output_metrics'] = output_metrics
                    job_args["suffix"] = suffix
                    job_args["log_file"] = logfile
                    job_args["log_level"] = log_level
                    job_args["probability"] = probability
                    job_args["pipeline"] = pipeline
                    job_args["results_path"] = results_path
                    job_args["sampler_parameters"] = (
                        fixed_sampler_parameters
                        if fixed_sampler_parameters else dict())
                    job_args["fixed_parameters"] = (
                        fixed_parameter_list[learner_num]
                        if fixed_parameter_list else dict())
                    job_args["param_grid"] = (param_grid_list[learner_num]
                                              if param_grid_list else None)
                    job_args["pos_label_str"] = pos_label_str
                    job_args["overwrite"] = overwrite
                    job_args["feature_scaling"] = feature_scaling
                    job_args["min_feature_count"] = min_feature_count
                    job_args["grid_search_jobs"] = grid_search_jobs
                    job_args["grid_search_folds"] = grid_search_folds
                    job_args["folds_file"] = folds_file
                    job_args["cv_folds"] = cv_folds
                    job_args["save_cv_folds"] = save_cv_folds
                    job_args["save_cv_models"] = save_cv_models
                    job_args["use_folds_file_for_grid_search"] = \
                        use_folds_file_for_grid_search
                    job_args["do_stratified_folds"] = do_stratified_folds
                    job_args["label_col"] = label_col
                    job_args["id_col"] = id_col
                    job_args["ids_to_floats"] = ids_to_floats
                    job_args["quiet"] = quiet
                    job_args["class_map"] = class_map
                    job_args["custom_learner_path"] = custom_learner_path
                    job_args["custom_metric_path"] = custom_metric_path
                    job_args["learning_curve_cv_folds"] = \
                        learning_curve_cv_folds_list[learner_num]
                    job_args["learning_curve_train_sizes"] = \
                        learning_curve_train_sizes

                    if not local:
                        jobs.append(
                            Job(_classify_featureset, [job_args],
                                num_slots=(MAX_CONCURRENT_PROCESSES if
                                           (do_grid_search or task
                                            == 'learning_curve') else 1),
                                name=job_name,
                                queue=queue))
                    else:
                        _classify_featureset(job_args)

        # Call get_skll_logger again after _classify_featureset
        # calls are finished so that any warnings that may
        # happen after this point get correctly logged to the
        # main logger
        logger = get_skll_logger('experiment')

        # submit the jobs (if running on grid)
        if not local and _HAVE_GRIDMAP:
            if log_path:
                job_results = process_jobs(jobs,
                                           white_list=hosts,
                                           temp_dir=log_path)
            else:
                job_results = process_jobs(jobs, white_list=hosts)
            _check_job_results(job_results)

        # write out the summary results file
        if (task == 'cross_validate' or task == 'evaluate') and write_summary:
            summary_file_name = experiment_name + '_summary.tsv'
            with open(join(results_path, summary_file_name), 'w',
                      newline='') as output_file:
                _write_summary_file(result_json_paths,
                                    output_file,
                                    ablation=ablation)
        elif task == 'learning_curve':
            output_file_name = experiment_name + '_summary.tsv'
            output_file_path = join(results_path, output_file_name)
            with open(output_file_path, 'w', newline='') as output_file:
                _write_learning_curve_file(result_json_paths, output_file)

            # generate the actual plot if we have the requirements installed
            generate_learning_curve_plots(experiment_name, results_path,
                                          output_file_path)

    finally:

        # Close/remove any logger handlers
        close_and_remove_logger_handlers(get_skll_logger('experiment'))

    return result_json_paths
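
A minimal usage sketch (hypothetical configuration path), running everything locally rather than submitting jobs to a DRMAA grid. run_configuration returns the paths to the per-job JSON result files.

# Assumes run_configuration is importable, e.g. from skll.experiments.
result_json_paths = run_configuration('my_experiment.cfg',
                                      local=True,
                                      overwrite=False)
print(result_json_paths)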
Example #6
def parse_config_file(config_path, log_level=logging.INFO):
    """
    Parse a SKLL experiment configuration file at the given path,
    logging messages at the given log level (default: INFO).

    Parameters
    ----------
    config_path : str
        The path to the configuration file.
    log_level : logging level, optional
        The logging level to use.
        Defaults to ``logging.INFO``.

    Returns
    -------
    experiment_name : str
        A string used to identify this particular experiment configuration.
        When generating result summary files, this name helps prevent
        overwriting previous summaries.
    task : str
        The type of experiment we're trying to run (e.g. 'cross_validate').
    sampler : str
        The name of a sampler to perform non-linear transformations of the input.
    fixed_sampler_parameters : dict
        A dictionary containing parameters you want to have fixed for the sampler.
    feature_hasher : bool
        If True, this enables a high-speed, low-memory vectorizer that uses
        feature hashing for converting feature dictionaries into NumPy arrays
        instead of using a DictVectorizer.
    hasher_features : int
        The number of features used by the FeatureHasher if the feature_hasher
        flag is enabled.
    id_col : str
        The column with IDs.
    label_col : str
        The column with labels.
    train_set_name : str
        The name of the training set.
    test_set_name : str
        The name of the test set.
    suffix : str
        The file format the training/test files are in.
    featuresets : list of lists of str
        A list of lists of prefixes for the files containing
        the features you would like to train/test on.
    do_shuffle : bool
        Whether to shuffle the data.
    model_path : str
        The path to the model file(s).
    do_grid_search : bool
        Whether to perform grid search.
    grid_objectives : list of str
        A list of scoring functions to use for tuning.
    probability : bool
        Whether to output probabilities for each class.
    pipeline : bool
        Whether to include the `pipeline` attribute in the
        trained model. This will increase the size of the
        model file.
    results_path : str
        Path to store result files in.
    pos_label_str : str
        The string label for the positive class in the binary
        classification setting.
    feature_scaling : str
        How to scale features (e.g. 'with_mean').
    min_feature_count : int
        The minimum number of examples for which the value of a
        feature must be nonzero to be included in the model.
    folds_file : str
        The path to the folds_file, if specified.
    grid_search_jobs : int
        Number of folds to run in parallel when using grid search.
    grid_search_folds : int
        The number of folds to use for grid search.
    cv_folds : dict or int
        The specified folds mapping, or the number of folds.
    save_cv_folds : bool
        Whether to save the cross-validation folds to a file.
    save_cv_models : bool
        Whether to save CV models.
    use_folds_file_for_grid_search : bool
        Whether to use folds file for grid search.
    do_stratified_folds : bool
        Whether to use stratified folds for cross-validation.
    fixed_parameter_list : list of dict
        List of dicts containing parameters you want to have fixed for
        each classifier in learners list.
    param_grid_list : list of dict
        List of parameter grids to search, one dict for each learner.
    featureset_names : list of str
        The names of the featuresets used for each job.
    learners : list of str
        A list of learners to try using.
    prediction_dir : str
        The directory where predictions are saved.
    log_path : str
        The path to the log file.
    train_path : str
        The path to a file containing features to train on.
    test_path : str
        The path to a file containing features to test on.
    ids_to_floats : bool
        Whether to convert IDs to floats.
    class_map : dict
        A class map collapsing several labels into one.
    custom_learner_path : str
        Path to a .py file that defines a custom learner.
    custom_metric_path : str
        Path to a .py file that defines a custom metric.
    learning_curve_cv_folds_list : list of int
        A list of integers specifying the number of folds to use for CV.
    learning_curve_train_sizes : list of float or list of int
        A list of floats (relative proportions) or integers (absolute numbers)
        of training examples that will be used to generate the learning curve.
    output_metrics : list
        A list of output metrics to use.

    Raises
    ------
    IOError
        If the configuration file name is empty.
    ValueError
        If various configuration parameters are incorrectly specified,
        or cause conflicts.
    """

    # check that config_path is not empty
    if config_path == "":
        raise IOError("The name of the configuration file is empty")

    # compute the absolute path for the config file
    config_path = realpath(config_path)
    config_dir = dirname(config_path)

    # set up a config parser with the above default values
    config = _setup_config_parser(config_path)

    # extract parameters from the various sections in the config file

    ######################
    # 1. General section #
    ######################
    if config.has_option("General", "experiment_name"):
        experiment_name = config.get("General", "experiment_name")
    else:
        raise ValueError("Configuration file does not contain experiment_name "
                         "in the [General] section.")

    # next, get the log path before anything else since we need to
    # save all logging messages to a log file in addition to displaying
    # them on the console
    try:
        log_path = locate_file(config.get("Output", "log"), config_dir)
    except IOError as e:
        if e.errno == errno.ENOENT:
            log_path = e.filename
            os.makedirs(log_path)

    # Create a top-level log file under the log path
    main_log_file = join(log_path, '{}.log'.format(experiment_name))

    # Now create a SKLL logger that will log to this file as well
    # as to the console. Use the log level provided - note that
    # we only have to do this the first time we call `get_skll_logger()`
    # with a given name.
    logger = get_skll_logger('experiment',
                             filepath=main_log_file,
                             log_level=log_level)

    if config.has_option("General", "task"):
        task = config.get("General", "task")
    else:
        raise ValueError("Configuration file does not contain task in the "
                         "[General] section.")
    if task not in VALID_TASKS:
        raise ValueError('An invalid task was specified: {}.  Valid tasks are:'
                         ' {}'.format(task, ', '.join(VALID_TASKS)))

    ####################
    # 2. Input section #
    ####################
    sampler = config.get("Input", "sampler")
    if sampler not in VALID_SAMPLERS:
        raise ValueError('An invalid sampler was specified: {}.  Valid '
                         'samplers are: {}'.format(sampler,
                                                   ', '.join(VALID_SAMPLERS)))

    # raise an error if feature_hasher is set but hasher_features
    # is less than or equal to zero
    feature_hasher = config.getboolean("Input", "feature_hasher")
    hasher_features = config.getint("Input", "hasher_features")
    if feature_hasher:
        if hasher_features <= 0:
            raise ValueError(
                "Configuration file must specify a non-zero value "
                "for the option hasher_features when "
                "feature_hasher is True.")

    # produce warnings if hasher_features is set but feature_hasher
    # is not set correctly
    elif hasher_features > 0:
        logger.warning(
            "Ignoring hasher_features since feature_hasher is either"
            " missing or set to False.")

    if config.has_option("Input", "learners"):
        learners_string = config.get("Input", "learners")
    else:
        raise ValueError(
            "Configuration file does not contain list of learners "
            "in [Input] section.")
    learners = yaml.safe_load(fix_json(learners_string))

    if len(learners) == 0:
        raise ValueError(
            "Configuration file contains an empty list of learners"
            " in the [Input] section.")

    elif len(set(learners)) < len(learners):
        raise ValueError(
            'Configuration file contains the same learner multiple'
            ' times, which is not currently supported.  Please use'
            ' param_grids with tuning to find the optimal settings'
            ' for the learner.')
    custom_learner_path = locate_file(
        config.get("Input", "custom_learner_path"), config_dir)

    # get the custom metric path, if specified, and locate it
    custom_metric_path = locate_file(config.get("Input", "custom_metric_path"),
                                     config_dir)

    # get the featuresets
    featuresets_string = config.get("Input", "featuresets")
    featuresets = yaml.safe_load(fix_json(featuresets_string))

    # ensure that featuresets is either a list of features or a list of lists
    # of features
    if not isinstance(featuresets, list) or not all(
            isinstance(fs, list) for fs in featuresets):
        raise ValueError("The featuresets parameter should be a list of "
                         "features or a list of lists of features. You "
                         "specified: {}".format(featuresets))

    featureset_names = yaml.safe_load(
        fix_json(config.get("Input", "featureset_names")))

    # ensure that featureset_names is a list of strings, if specified
    if featureset_names:
        if (not isinstance(featureset_names, list)
                or not all([isinstance(fs, str) for fs in featureset_names])):
            raise ValueError(
                "The featureset_names parameter should be a list "
                "of strings. You specified: {}".format(featureset_names))

    # get the value for learning_curve_cv_folds and ensure
    # that it's a list of the same length as the value of
    # learners. If it's not specified, then we just assume
    # that we are using 10 folds for each learner.
    learning_curve_cv_folds_list_string = config.get(
        "Input", "learning_curve_cv_folds_list")
    learning_curve_cv_folds_list = yaml.safe_load(
        fix_json(learning_curve_cv_folds_list_string))
    if len(learning_curve_cv_folds_list) == 0:
        learning_curve_cv_folds_list = [10] * len(learners)
    else:
        if (not isinstance(learning_curve_cv_folds_list, list) or not all(
            [isinstance(fold, int) for fold in learning_curve_cv_folds_list])
                or not len(learning_curve_cv_folds_list) == len(learners)):
            raise ValueError(
                "The learning_curve_cv_folds parameter should "
                "be a list of integers of the same length as "
                "the number of learners. You specified: {}".format(
                    learning_curve_cv_folds_list))

    # get the value for learning_curve_train_sizes and ensure
    # that it's a list of either integers (sizes) or
    # floats (proportions). If it's not specified, then we just
    # assume that we are using np.linspace(0.1, 1.0, 5).
    learning_curve_train_sizes_string = config.get(
        "Input", "learning_curve_train_sizes")
    learning_curve_train_sizes = yaml.safe_load(
        fix_json(learning_curve_train_sizes_string))
    if len(learning_curve_train_sizes) == 0:
        learning_curve_train_sizes = np.linspace(0.1, 1.0, 5).tolist()
    else:
        if (not isinstance(learning_curve_train_sizes, list) or not all([
                isinstance(size, int) or isinstance(size, float)
                for size in learning_curve_train_sizes
        ])):
            raise ValueError(
                "The learning_curve_train_sizes parameter should "
                "be a list of integers or floats. You specified: {}".format(
                    learning_curve_train_sizes))

    # do we need to shuffle the training data
    do_shuffle = config.getboolean("Input", "shuffle")

    fixed_parameter_list = yaml.safe_load(
        fix_json(config.get("Input", "fixed_parameters")))
    fixed_sampler_parameters = fix_json(
        config.get("Input", "sampler_parameters"))
    fixed_sampler_parameters = yaml.safe_load(fixed_sampler_parameters)
    param_grid_list = yaml.safe_load(
        fix_json(config.get("Tuning", "param_grids")))

    # read and normalize the value of `pos_label_str`
    pos_label_str = safe_float(config.get("Tuning", "pos_label_str"))
    if pos_label_str == '':
        pos_label_str = None

    # ensure that feature_scaling is specified only as one of the
    # four available choices
    feature_scaling = config.get("Input", "feature_scaling")
    if feature_scaling not in VALID_FEATURE_SCALING_OPTIONS:
        raise ValueError(
            "Invalid value for feature_scaling parameter: {}".format(
                feature_scaling))

    suffix = config.get("Input", "suffix")
    label_col = config.get("Input", "label_col")
    id_col = config.get("Input", "id_col")
    ids_to_floats = config.getboolean("Input", "ids_to_floats")

    # if an external folds file is specified, then read it into a dictionary
    folds_file = locate_file(config.get("Input", "folds_file"), config_dir)
    num_cv_folds = config.getint("Input", "num_cv_folds")
    specified_folds_mapping = None
    specified_num_folds = None
    if folds_file:
        specified_folds_mapping = load_cv_folds(folds_file,
                                                ids_to_floats=ids_to_floats)
    else:
        # if no file is specified, then set the number of folds for cross-validation
        specified_num_folds = num_cv_folds if num_cv_folds else 10
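    # A minimal sketch of the expected folds file, assuming SKLL's documented
    # CSV layout of one header row followed by "example ID, fold ID" pairs:
    #
    #   id,fold
    #   EXAMPLE_0,0
    #   EXAMPLE_1,1
    #
    # The example IDs are converted to floats only if `ids_to_floats` is true.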

    # whether or not to save the cv fold ids/models
    save_cv_folds = config.getboolean("Output", "save_cv_folds")
    save_cv_models = config.getboolean("Output", "save_cv_models")

    # whether or not to do stratified cross validation
    random_folds = config.getboolean("Input", "random_folds")
    if random_folds:
        if folds_file:
            logger.warning('Specifying "folds_file" overrides "random_folds".')
        do_stratified_folds = False
    else:
        do_stratified_folds = True
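    # In other words, folds are stratified by label unless the user explicitly
    # asked for random folds; when a "folds_file" is given, the fold
    # assignments come from that file, so this flag is effectively moot.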

    # get all the input paths and directories (without trailing slashes)
    train_path = config.get("Input", "train_directory").rstrip(os.sep)
    test_path = config.get("Input", "test_directory").rstrip(os.sep)
    train_file = config.get("Input", "train_file")
    test_file = config.get("Input", "test_file")

    # make sure that featuresets is not an empty list unless
    # train_file and test_file are specified
    if not train_file and not test_file and (isinstance(featuresets, list)
                                             and len(featuresets) == 0):
        raise ValueError(
            "The 'featuresets' parameters cannot be an empty list.")

    # Either train_file or train_directory must be specified.
    if not train_file and not train_path:
        raise ValueError('Invalid [Input] parameters: either "train_file" or '
                         '"train_directory" must be specified in the '
                         'configuration file.')

    # But train_file and train_directory cannot both be specified.
    if train_file and train_path:
        raise ValueError('Invalid [Input] parameters: only either "train_file"'
                         ' or "train_directory" can be specified in the '
                         'configuration file, not both.')

    # Cannot specify both test_file and test_path
    if test_file and test_path:
        raise ValueError('Invalid [Input] parameters: only either "test_file" '
                         'or "test_directory" can be specified in the '
                         'configuration file, not both.')

    # if train_file is specified, then assign its value to train_path
    # this is a workaround to make this simple use case (a single train and
    # test file) compatible with the existing architecture using
    # featuresets
    if train_file:
        train_path = train_file
        featuresets = [['train_{}'.format(basename(train_file))]]
        suffix = ''

    # if test_file is specified, then assign its value to test_path to
    # enable compatibility with the pre-existing featuresets architecture
    if test_file:
        test_path = test_file
        featuresets[0][0] += '_test_{}'.format(basename(test_file))
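    # For example, hypothetical files "foo.jsonlines" and "bar.jsonlines"
    # would yield the single featureset name
    # "train_foo.jsonlines_test_bar.jsonlines" with an empty suffix.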

    # make sure all the specified paths/files exist
    train_path = locate_file(train_path, config_dir)
    test_path = locate_file(test_path, config_dir)

    # Get class mapping dictionary if specified
    class_map_string = config.get("Input", "class_map")
    original_class_map = yaml.safe_load(fix_json(class_map_string))
    if original_class_map:
        # Change class_map to map from originals to replacements instead of
        # from replacement to list of originals
        class_map = {}
        for replacement, original_list in original_class_map.items():
            for original in original_list:
                class_map[original] = replacement
        del original_class_map
    else:
        class_map = None
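    # For example, a hypothetical class_map of
    #   {"positive": ["POS", "Pos"], "negative": ["NEG", "Neg"]}
    # is inverted here into
    #   {"POS": "positive", "Pos": "positive",
    #    "NEG": "negative", "Neg": "negative"}
    # so that each original label maps directly to its replacement.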

    #####################
    # 3. Output section #
    #####################
    probability = config.getboolean("Output", "probability")
    pipeline = config.getboolean("Output", "pipeline")

    # do we want to keep the predictions?
    # make sure the predictions path exists and, if not, create it
    try:
        prediction_dir = locate_file(config.get("Output", "predictions"),
                                     config_dir)
    except IOError as e:
        if e.errno == errno.ENOENT:
            prediction_dir = e.filename
            os.makedirs(prediction_dir)

    # make sure the model path exists and, if not, create it
    try:
        model_path = locate_file(config.get("Output", "models"), config_dir)
    except IOError as e:
        if e.errno == errno.ENOENT:
            model_path = e.filename
            os.makedirs(model_path)

    # make sure results path exists
    try:
        results_path = locate_file(config.get("Output", "results"), config_dir)
    except IOError as e:
        if e.errno == errno.ENOENT:
            results_path = e.filename
            os.makedirs(results_path)

    # what are the output metrics?
    output_metrics = config.get("Output", "metrics")
    output_metrics = _parse_and_validate_metrics(output_metrics,
                                                 'metrics',
                                                 logger=logger)

    #####################
    # 4. Tuning section #
    #####################

    # do we need to run a grid search for the hyperparameters or are we just
    # using the defaults?
    do_grid_search = config.getboolean("Tuning", "grid_search")

    # parse any provided grid objective functions
    grid_objectives = config.get("Tuning", "objectives")
    grid_objectives = _parse_and_validate_metrics(grid_objectives,
                                                  'objectives',
                                                  logger=logger)

    # if we are generating learning curves, grid search is not needed,
    # so disable it
    if task == 'learning_curve' and do_grid_search:
        do_grid_search = False
        logger.warning("Grid search is not supported during "
                       "learning curve generation. Disabling.")

    # Check if `param_grids` is specified, but `do_grid_search` is False
    if param_grid_list and not do_grid_search:
        logger.warning('Since "grid_search" is set to False, the specified'
                       ' "param_grids" will be ignored.')

    # Warn user about potential conflicts between parameter values
    # specified in `fixed_parameter_list` and values specified in
    # `param_grid_list` (or values passed in by default) if
    # `do_grid_search` is True
    if do_grid_search and fixed_parameter_list:
        logger.warning('Note that "grid_search" is set to True and '
                       '"fixed_parameters" is also specified. If there '
                       'is a conflict between the grid search parameter'
                       ' space and the fixed parameter values, the '
                       'fixed parameter values will take precedence.')
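    # For example, if a hypothetical fixed parameter sets C=1.0 for a learner
    # while its param_grid also searches over C, the fixed value of 1.0 is
    # the one that ends up being used.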

    # the minimum number of examples in which a feature must be nonzero
    # for it to be included
    min_feature_count = config.getint("Tuning", "min_feature_count")

    # if an external folds file was specified, do we use the same folds file
    # for the inner grid search when cross-validating as well?
    use_folds_file_for_grid_search = config.getboolean(
        "Tuning", "use_folds_file_for_grid_search")

    # how many jobs should we run in parallel for grid search
    grid_search_jobs = config.getint("Tuning", "grid_search_jobs")
    if not grid_search_jobs:
        grid_search_jobs = None

    # how many folds should we run in parallel for grid search
    grid_search_folds = config.getint("Tuning", "grid_search_folds")

    # check whether the right things are set for the given task
    if (task == 'evaluate' or task == 'predict') and not test_path:
        raise ValueError('The test set must be set when task is evaluate or '
                         'predict.')
    if task in ['cross_validate', 'evaluate', 'train']:
        if do_grid_search and len(grid_objectives) == 0:
            raise ValueError(
                'Grid search is on. Either specify a list of tuning '
                'objectives or set `grid_search` to `false` in the '
                'Tuning section.')
        if not do_grid_search and len(grid_objectives) > 0:
            logger.warning('Since "grid_search" is set to False, any specified'
                           ' "objectives" will be ignored.')
            grid_objectives = []
    if task in ['cross_validate', 'train', 'learning_curve'] and test_path:
        raise ValueError('The test set should not be set when task is '
                         '{}.'.format(task))
    if task in ['train', 'predict'] and results_path and not do_grid_search:
        raise ValueError('The results path should not be set when task is '
                         '{} and "grid_search" is set to False.'.format(task))
    if task == 'train' and not model_path:
        raise ValueError('The model path should be set when task is train.')
    if task in ['learning_curve', 'train'] and prediction_dir:
        raise ValueError('The predictions path should not be set when task is '
                         '{}.'.format(task))
    if task == 'learning_curve' and model_path:
        raise ValueError('The models path should not be set when task is '
                         'learning_curve.')
    if task == 'learning_curve':
        if len(grid_objectives) > 0:
            raise ValueError("The \"objectives\" option "
                             "is no longer supported for the "
                             "\"learning_curve\" "
                             "task. Please use the \"metrics\" "
                             "option in the [Output] "
                             "section instead.")
        if len(output_metrics) == 0:
            raise ValueError('The "metrics" option must be set when '
                             'the task is "learning_curve".')

    # if any of the objectives or metrics require probabilities to be output,
    # probability must be specified as true
    specified_probabilistic_metrics = PROBABILISTIC_METRICS.intersection(
        grid_objectives + output_metrics)
    if specified_probabilistic_metrics and not probability:
        raise ValueError("The 'probability' option must be 'true' "
                         " to compute the following: "
                         "{}.".format(list(specified_probabilistic_metrics)))

    # set the folds appropriately based on the task:
    #  (a) if the task is `train`/`evaluate`/`predict` and if an external
    #      fold mapping is specified then use that mapping for grid search
    #      instead of the value contained in `grid_search_folds`.
    #  (b) if the task is `cross_validate` and an external fold mapping is specified
    #      then use that mapping for the outer CV loop and for the inner grid-search
    #      loop. However, if `use_folds_file_for_grid_search` is `False`, do not
    #      use the fold mapping for the inner loop.
    cv_folds = None
    if task in ['train', 'evaluate', 'predict'] and specified_folds_mapping:
        grid_search_folds = specified_folds_mapping
        # only print out the warning if the user actually wants to do grid search
        if do_grid_search:
            logger.warning("Specifying \"folds_file\" overrides both "
                           "explicit and default \"grid_search_folds\".")
    if task == 'cross_validate':
        cv_folds = specified_folds_mapping if specified_folds_mapping else specified_num_folds
        if specified_folds_mapping:
            logger.warning("Specifying \"folds_file\" overrides both "
                           "explicit and default \"num_cv_folds\".")
            if use_folds_file_for_grid_search:
                grid_search_folds = cv_folds
            else:
                # only print out the warning if the user wants to do grid search
                if do_grid_search:
                    logger.warning("The specified \"folds_file\" will "
                                   "not be used for inner grid search.")
        if save_cv_models and not model_path:
            raise ValueError("Output directory for models must be set if "
                             "\"save_cv_models\" is set to true.")

    # Create feature set names if unspecified
    if not featureset_names:
        featureset_names = [_munge_featureset_name(x) for x in featuresets]
    if len(featureset_names) != len(featuresets):
        raise ValueError(('Number of feature set names (%s) does not match '
                          'number of feature sets (%s).') %
                         (len(featureset_names), len(featuresets)))

    # store training/test set names for later use
    train_set_name = basename(train_path)
    test_set_name = basename(test_path) if test_path else "cv"

    return (experiment_name, task, sampler, fixed_sampler_parameters,
            feature_hasher, hasher_features, id_col, label_col, train_set_name,
            test_set_name, suffix, featuresets, do_shuffle, model_path,
            do_grid_search, grid_objectives, probability, pipeline,
            results_path, pos_label_str, feature_scaling, min_feature_count,
            folds_file, grid_search_jobs, grid_search_folds, cv_folds,
            save_cv_folds, save_cv_models, use_folds_file_for_grid_search,
            do_stratified_folds, fixed_parameter_list, param_grid_list,
            featureset_names, learners, prediction_dir, log_path, train_path,
            test_path, ids_to_floats, class_map, custom_learner_path,
            custom_metric_path, learning_curve_cv_folds_list,
            learning_curve_train_sizes, output_metrics)