Example #1
0
def test_learning_curve_implementation():
    """
    Check that the SKLL learning curve agrees with the scikit-learn one
    """

    # Unlike the other tests, which only use regression data, this test
    # compares directly against scikit-learn on the digits data so that it
    # fails as soon as our implementation diverges from scikit-learn's.
    # In that sense it doubles as a regression test.

    # the digits data set is the common input for both implementations
    digits = load_digits()
    X, y = digits.data, digits.target

    # settings shared by the scikit-learn and the SKLL runs
    num_folds = 10
    seed = 123456789
    sizes = np.linspace(.1, 1.0, 5)

    # reference results computed directly with scikit-learn
    splitter = ShuffleSplit(n_splits=num_folds,
                            test_size=0.2,
                            random_state=seed)
    (sklearn_sizes,
     sklearn_train_scores,
     sklearn_test_scores) = learning_curve(MultinomialNB(),
                                           X,
                                           y,
                                           cv=splitter,
                                           train_sizes=sizes,
                                           scoring='accuracy')

    # wrap the same data in a FeatureSet instance so that it can be
    # passed through the SKLL API
    names = ['f{:02}'.format(n) for n in range(X.shape[1])]
    feature_dicts = [dict(zip(names, row)) for row in X]
    example_ids = list(range(X.shape[0]))
    fs = FeatureSet('train',
                    features=feature_dicts,
                    labels=y,
                    ids=example_ids)

    # scikit-learn does not filter out any features, so neither should we
    skll_learner = Learner('MultinomialNB', min_feature_count=0)
    skll_results = skll_learner.learning_curve(fs,
                                               cv_folds=num_folds,
                                               train_sizes=sizes,
                                               metric='accuracy')
    skll_train_scores, skll_test_scores, skll_sizes = skll_results

    # both implementations must agree on the sizes and on all scores
    assert np.all(sklearn_sizes == skll_sizes)
    assert np.allclose(sklearn_train_scores, skll_train_scores)
    assert np.allclose(sklearn_test_scores, skll_test_scores)
Example #2
0
def _classify_featureset(args):
    """
    Classification job to be submitted to grid.

    Parameters
    ----------
    args : dict
        A dictionary with arguments for classifying the
        ``FeatureSet`` instance.

    Returns
    -------
    res : list of dicts
        The results of the classification, in the format
        of a list of dictionaries.

    Raises
    ------
    ValueError
        If extra unknown arguments are passed to the function.
    """

    # Extract all the arguments.
    # (There doesn't seem to be a better way to do this since one can't specify
    # required keyword arguments.)

    experiment_name = args.pop("experiment_name")
    task = args.pop("task")
    sampler = args.pop("sampler")
    feature_hasher = args.pop("feature_hasher")
    hasher_features = args.pop("hasher_features")
    job_name = args.pop("job_name")
    featureset = args.pop("featureset")
    featureset_name = args.pop("featureset_name")
    learner_name = args.pop("learner_name")
    train_path = args.pop("train_path")
    test_path = args.pop("test_path")
    train_set_name = args.pop("train_set_name")
    test_set_name = args.pop("test_set_name")
    shuffle = args.pop('shuffle')
    model_path = args.pop("model_path")
    prediction_prefix = args.pop("prediction_prefix")
    grid_search = args.pop("grid_search")
    grid_objective = args.pop("grid_objective")
    output_metrics = args.pop("output_metrics")
    suffix = args.pop("suffix")
    job_log_file = args.pop("log_file")
    job_log_level = args.pop("log_level")
    probability = args.pop("probability")
    pipeline = args.pop("pipeline")
    results_path = args.pop("results_path")
    fixed_parameters = args.pop("fixed_parameters")
    sampler_parameters = args.pop("sampler_parameters")
    param_grid = args.pop("param_grid")
    pos_label_str = args.pop("pos_label_str")
    overwrite = args.pop("overwrite")
    feature_scaling = args.pop("feature_scaling")
    min_feature_count = args.pop("min_feature_count")
    folds_file = args.pop("folds_file")
    grid_search_jobs = args.pop("grid_search_jobs")
    grid_search_folds = args.pop("grid_search_folds")
    cv_folds = args.pop("cv_folds")
    save_cv_folds = args.pop("save_cv_folds")
    save_cv_models = args.pop("save_cv_models")
    use_folds_file_for_grid_search = args.pop("use_folds_file_for_grid_search")
    stratified_folds = args.pop("do_stratified_folds")
    label_col = args.pop("label_col")
    id_col = args.pop("id_col")
    ids_to_floats = args.pop("ids_to_floats")
    class_map = args.pop("class_map")
    custom_learner_path = args.pop("custom_learner_path")
    custom_metric_path = args.pop("custom_metric_path")
    quiet = args.pop('quiet', False)
    learning_curve_cv_folds = args.pop("learning_curve_cv_folds")
    learning_curve_train_sizes = args.pop("learning_curve_train_sizes")

    if args:
        raise ValueError(("Extra arguments passed to _classify_featureset: "
                          "{}").format(args.keys()))
    start_timestamp = datetime.datetime.now()

    # create a new SKLL logger for this specific job and
    # use the given log level
    logger = get_skll_logger(job_name, job_log_file, log_level=job_log_level)

    try:

        # log messages
        logger.info("Task: {}".format(task))

        # check if we have any possible custom metrics
        possible_custom_metric_names = []
        for metric_name in output_metrics + [grid_objective]:
            # metrics that are not in `SCORERS` or `None` are candidates
            # (the `None` is a by-product of how jobs with single tuning
            # objectives are created)
            if metric_name not in SCORERS and metric_name is not None:
                possible_custom_metric_names.append(metric_name)
            # if the metric is already in `SCORERS`, is it a custom one
            # that we already registered? if so, log that
            elif metric_name in _CUSTOM_METRICS:
                logger.info(
                    f"custom metric '{metric_name}' is already registered")

        # initialize list that will hold any invalid metrics
        # that we could not register as custom metrics
        invalid_metric_names = []

        # if we have possible custom metrics
        if possible_custom_metric_names:

            # check that we have a file to load them from
            if not custom_metric_path:
                raise ValueError(
                    f"invalid metrics specified: {possible_custom_metric_names}"
                )
            else:
                # try to register each possible custom metric
                # raise an exception if we fail, if we don't then
                # add the custom metric function to `globals()` so
                # that it serializes properly for gridmap
                for custom_metric_name in possible_custom_metric_names:
                    try:
                        custom_metric_func = register_custom_metric(
                            custom_metric_path, custom_metric_name)
                    except (AttributeError, NameError, ValueError):
                        invalid_metric_names.append(custom_metric_name)
                    else:
                        logger.info(f"registered '{custom_metric_name}' as a "
                                    f"custom metric")
                        globals()[custom_metric_name] = custom_metric_func

        # raise an error if we have any invalid metrics
        if invalid_metric_names:
            raise ValueError(
                f"invalid metrics specified: {invalid_metric_names}. "
                f"If these are custom metrics, check the function "
                f"names.")

        if task == 'cross_validate':
            if isinstance(cv_folds, int):
                num_folds = cv_folds
            else:  # folds_file was used, so count the unique fold ids.
                num_folds = len(set(cv_folds.values()))
            logger.info("Cross-validating ({} folds) on {}, feature "
                        "set {} ...".format(num_folds, train_set_name,
                                            featureset))
        elif task == 'evaluate':
            logger.info("Training on {}, Test on {}, "
                        "feature set {} ...".format(train_set_name,
                                                    test_set_name, featureset))
        elif task == 'train':
            logger.info("Training on {}, feature set {} ...".format(
                train_set_name, featureset))
        elif task == 'learning_curve':
            logger.info("Generating learning curve "
                        "({} 80/20 folds, sizes={}, objective={}) on {}, "
                        "feature set {} ...".format(
                            learning_curve_cv_folds,
                            learning_curve_train_sizes, grid_objective,
                            train_set_name, featureset))
        else:  # predict
            logger.info("Training on {}, Making predictions on {}, "
                        "feature set {} ...".format(train_set_name,
                                                    test_set_name, featureset))

        # check whether a trained model on the same data with the same
        # featureset already exists if so, load it and then use it on test data
        modelfile = join(model_path, '{}.model'.format(job_name))
        if (task in ['cross_validate', 'learning_curve']
                or not exists(modelfile) or overwrite):
            train_examples = load_featureset(train_path,
                                             featureset,
                                             suffix,
                                             label_col=label_col,
                                             id_col=id_col,
                                             ids_to_floats=ids_to_floats,
                                             quiet=quiet,
                                             class_map=class_map,
                                             feature_hasher=feature_hasher,
                                             num_features=hasher_features,
                                             logger=logger)

            train_set_size = len(train_examples.ids)
            if not train_examples.has_labels:
                raise ValueError('Training examples do not have labels')
            # initialize a classifer object
            learner = Learner(learner_name,
                              probability=probability,
                              pipeline=pipeline,
                              feature_scaling=feature_scaling,
                              model_kwargs=fixed_parameters,
                              pos_label_str=pos_label_str,
                              min_feature_count=min_feature_count,
                              sampler=sampler,
                              sampler_kwargs=sampler_parameters,
                              custom_learner_path=custom_learner_path,
                              logger=logger)

        # load the model if it already exists
        else:
            # import custom learner into global namespace if we are reusing
            # a saved model
            if custom_learner_path:
                globals()[learner_name] = load_custom_learner(
                    custom_learner_path, learner_name)
            train_set_size = 'unknown'
            if exists(modelfile) and not overwrite:
                logger.info("Loading pre-existing {} model: {}".format(
                    learner_name, modelfile))
            learner = Learner.from_file(modelfile)

            # attach the job logger to this learner
            learner.logger = logger

        # Load test set if there is one
        if task == 'evaluate' or task == 'predict':
            test_examples = load_featureset(test_path,
                                            featureset,
                                            suffix,
                                            label_col=label_col,
                                            id_col=id_col,
                                            ids_to_floats=ids_to_floats,
                                            quiet=quiet,
                                            class_map=class_map,
                                            feature_hasher=feature_hasher,
                                            num_features=hasher_features)
            test_set_size = len(test_examples.ids)
        else:
            test_set_size = 'n/a'

        # compute information about xval and grid folds that can be put in results
        # in readable form
        if isinstance(cv_folds, dict):
            cv_folds_to_print = '{} via folds file'.format(
                len(set(cv_folds.values())))
        else:
            cv_folds_to_print = str(cv_folds)

        if isinstance(grid_search_folds, dict):
            grid_search_folds_to_print = \
                '{} via folds file'.format(len(set(grid_search_folds.values())))
        else:
            grid_search_folds_to_print = str(grid_search_folds)

        # create a list of dictionaries of the results information
        learner_result_dict_base = {
            'experiment_name':
            experiment_name,
            'train_set_name':
            train_set_name,
            'train_set_size':
            train_set_size,
            'test_set_name':
            test_set_name,
            'test_set_size':
            test_set_size,
            'featureset':
            json.dumps(featureset),
            'featureset_name':
            featureset_name,
            'shuffle':
            shuffle,
            'learner_name':
            learner_name,
            'task':
            task,
            'start_timestamp':
            start_timestamp.strftime('%d %b %Y %H:%M:'
                                     '%S.%f'),
            'version':
            __version__,
            'feature_scaling':
            feature_scaling,
            'folds_file':
            folds_file,
            'grid_search':
            grid_search,
            'grid_objective':
            grid_objective,
            'grid_search_folds':
            grid_search_folds_to_print,
            'min_feature_count':
            min_feature_count,
            'cv_folds':
            cv_folds_to_print,
            'using_folds_file':
            isinstance(cv_folds, dict) or isinstance(grid_search_folds, dict),
            'save_cv_folds':
            save_cv_folds,
            'save_cv_models':
            save_cv_models,
            'use_folds_file_for_grid_search':
            use_folds_file_for_grid_search,
            'stratified_folds':
            stratified_folds,
            'scikit_learn_version':
            SCIKIT_VERSION
        }

        # check if we're doing cross-validation, because we only load/save
        # models when we're not.
        task_results = None
        if task == 'cross_validate':
            logger.info('Cross-validating')
            (
                task_results, grid_scores, grid_search_cv_results_dicts,
                skll_fold_ids, models
            ) = learner.cross_validate(
                train_examples,
                shuffle=shuffle,
                stratified=stratified_folds,
                prediction_prefix=prediction_prefix,
                grid_search=grid_search,
                grid_search_folds=grid_search_folds,
                cv_folds=cv_folds,
                grid_objective=grid_objective,
                output_metrics=output_metrics,
                param_grid=param_grid,
                grid_jobs=grid_search_jobs,
                save_cv_folds=save_cv_folds,
                save_cv_models=save_cv_models,
                use_custom_folds_for_grid_search=use_folds_file_for_grid_search
            )
            if models:
                for index, m in enumerate(models, start=1):
                    modelfile = join(model_path,
                                     '{}_fold{}.model'.format(job_name, index))
                    m.save(modelfile)
        elif task == 'learning_curve':
            logger.info("Generating learning curve(s)")
            (curve_train_scores, curve_test_scores,
             computed_curve_train_sizes) = learner.learning_curve(
                 train_examples,
                 grid_objective,
                 cv_folds=learning_curve_cv_folds,
                 train_sizes=learning_curve_train_sizes)
        else:
            # if we have do not have a saved model, we need to train one.
            if not exists(modelfile) or overwrite:
                logger.info("Featurizing and training new {} model".format(
                    learner_name))

                (best_score, grid_search_cv_results) = learner.train(
                    train_examples,
                    shuffle=shuffle,
                    grid_search=grid_search,
                    grid_search_folds=grid_search_folds,
                    grid_objective=grid_objective,
                    param_grid=param_grid,
                    grid_jobs=grid_search_jobs)
                grid_scores = [best_score]
                grid_search_cv_results_dicts = [grid_search_cv_results]

                # save model
                if model_path:
                    learner.save(modelfile)

                if grid_search:
                    logger.info("Best {} grid search score: {}".format(
                        grid_objective, round(best_score, 3)))
            else:
                grid_scores = [None]
                grid_search_cv_results_dicts = [None]

            # print out the parameters
            param_out = ('{}: {}'.format(param_name, param_value)
                         for param_name, param_value in
                         learner.model.get_params().items())
            logger.info("Hyperparameters: {}".format(', '.join(param_out)))

            # run on test set or cross-validate on training data,
            # depending on what was asked for
            if task == 'evaluate':
                logger.info("Evaluating predictions")
                task_results = [
                    learner.evaluate(test_examples,
                                     prediction_prefix=prediction_prefix,
                                     grid_objective=grid_objective,
                                     output_metrics=output_metrics)
                ]
            elif task == 'predict':
                logger.info("Writing predictions")
                # we set `class_labels` to `False` so that if the learner is
                # probabilistic, probabilities are written instead of labels
                learner.predict(test_examples,
                                prediction_prefix=prediction_prefix,
                                class_labels=False)
            # do nothing here for train

        end_timestamp = datetime.datetime.now()
        learner_result_dict_base['end_timestamp'] = end_timestamp.strftime(
            '%d %b %Y %H:%M:%S.%f')
        total_time = end_timestamp - start_timestamp
        learner_result_dict_base['total_time'] = str(total_time)

        if task == 'cross_validate' or task == 'evaluate':
            results_json_path = join(results_path,
                                     '{}.results.json'.format(job_name))

            res = _create_learner_result_dicts(task_results, grid_scores,
                                               grid_search_cv_results_dicts,
                                               learner_result_dict_base)

            # write out the result dictionary to a json file
            with open(results_json_path, 'w') as json_file:
                json.dump(res, json_file, cls=NumpyTypeEncoder)

            with open(join(results_path, '{}.results'.format(job_name)),
                      'w') as output_file:
                _print_fancy_output(res, output_file)

        elif task == 'learning_curve':
            results_json_path = join(results_path,
                                     '{}.results.json'.format(job_name))

            res = {}
            res.update(learner_result_dict_base)
            res.update({
                'learning_curve_cv_folds':
                learning_curve_cv_folds,
                'given_curve_train_sizes':
                learning_curve_train_sizes,
                'learning_curve_train_scores_means':
                np.mean(curve_train_scores, axis=1),
                'learning_curve_test_scores_means':
                np.mean(curve_test_scores, axis=1),
                'learning_curve_train_scores_stds':
                np.std(curve_train_scores, axis=1, ddof=1),
                'learning_curve_test_scores_stds':
                np.std(curve_test_scores, axis=1, ddof=1),
                'computed_curve_train_sizes':
                computed_curve_train_sizes
            })

            # we need to return and write out a list of dictionaries
            res = [res]

            # write out the result dictionary to a json file
            with open(results_json_path, 'w') as json_file:
                json.dump(res, json_file, cls=NumpyTypeEncoder)

        # For all other tasks, i.e. train or predict
        else:
            if results_path:
                results_json_path = join(results_path,
                                         '{}.results.json'.format(job_name))

                assert len(grid_scores) == 1
                assert len(grid_search_cv_results_dicts) == 1
                grid_search_cv_results_dict = {"grid_score": grid_scores[0]}
                grid_search_cv_results_dict["grid_search_cv_results"] = \
                    grid_search_cv_results_dicts[0]
                grid_search_cv_results_dict.update(learner_result_dict_base)
                # write out the result dictionary to a json file
                with open(results_json_path, 'w') as json_file:
                    json.dump(grid_search_cv_results_dict,
                              json_file,
                              cls=NumpyTypeEncoder)
            res = [learner_result_dict_base]

        # write out the cv folds if required
        if task == 'cross_validate' and save_cv_folds:
            skll_fold_ids_file = experiment_name + '_skll_fold_ids.csv'
            with open(join(results_path, skll_fold_ids_file),
                      'w') as output_file:
                _write_skll_folds(skll_fold_ids, output_file)

    finally:
        close_and_remove_logger_handlers(logger)

    return res