def test_api_with_custom_prob_metric():
    """Test API with custom probabilistic metric"""

    # register a custom metric from our file that requires probabilities
    input_dir = join(_my_dir, "other")
    custom_metrics_file = join(input_dir, "custom_metrics.py")
    register_custom_metric(custom_metrics_file, "fake_prob_metric")

    # create some classification data
    train_fs, _ = make_classification_data(num_examples=1000,
                                            num_features=10,
                                            num_labels=2)

    # set up a learner to tune using this probabilistic metric;
    # this should fail since LinearSVC doesn't support probabilities
    learner1 = Learner("LinearSVC")
    assert_raises_regex(AttributeError,
                        r"has no attribute 'predict_proba'",
                        learner1.train,
                        train_fs,
                        grid_objective="fake_prob_metric")

    # set up another learner with explicit probability support;
    # this should work just fine with our custom metric
    learner2 = Learner("SVC", probability=True)
    grid_score, _ = learner2.train(train_fs, grid_objective="fake_prob_metric")
    ok_(grid_score > 0.95)
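# For reference, a minimal sketch of how a probability-based custom metric
# such as "fake_prob_metric" might be defined in the custom_metrics.py
# fixture. The actual computation is not shown in this test, so the body
# below is illustrative only; the probability requirement is assumed to be
# signalled by the `needs_proba=True` keyword argument in the signature,
# mirroring the `greater_is_better` convention mentioned in
# test_api_with_inverted_custom_metric below.
from sklearn.metrics import average_precision_score


def fake_prob_metric(y_true, y_probs, needs_proba=True):
    # `y_probs` holds positive-class probabilities for binary problems
    return average_precision_score(y_true, y_probs)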
def test_custom_metric_api_experiment_with_kappa_filename():
    """Test API with metric defined in a file named kappa"""

    # register a dummy metric that just returns 1 from
    # a file called 'kappa.py'
    input_dir = join(_my_dir, "other")
    custom_metrics_file = join(input_dir, "kappa.py")
    register_custom_metric(custom_metrics_file, "dummy_metric")

    # read in some train/test data
    train_file = join(input_dir, "examples_train.jsonlines")
    test_file = join(input_dir, "examples_test.jsonlines")
    train_fs = NDJReader.for_path(train_file).read()
    test_fs = NDJReader.for_path(test_file).read()

    # set up a learner to tune using our usual kappa metric
    # and evaluate it using the dummy metric we loaded;
    # this should work as there should be no conflict between
    # the two "kappa" names
    learner = Learner("LogisticRegression")
    _ = learner.train(train_fs, grid_objective="unweighted_kappa")
    results = learner.evaluate(
        test_fs,
        grid_objective="unweighted_kappa",
        output_metrics=["balanced_accuracy", "dummy_metric"])
    test_objective_value = results[-2]
    test_output_metrics_dict = results[-1]
    test_accuracy_value = test_output_metrics_dict["balanced_accuracy"]
    test_dummy_metric_value = test_output_metrics_dict["dummy_metric"]

    # check that the values are as expected
    assert_almost_equal(test_objective_value, 0.9699, places=4)
    assert_almost_equal(test_accuracy_value, 0.9792, places=4)
    eq_(test_dummy_metric_value, 1.0)
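# The kappa.py fixture used above presumably contains nothing more than a
# dummy metric that ignores its inputs and always returns 1, along these
# lines (a sketch, not the actual fixture file):
def dummy_metric(y_true, y_pred):
    # return 1 regardless of the labels or predictions
    return 1.0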
def test_register_custom_metric_missing_file():
    """Test loading custom metric from missing file"""

    # try to load a metric from a .py file that does not exist;
    # register_custom_metric should refuse to do this
    # (a ValueError is assumed here)
    metric_dir = join(_my_dir, "other")
    missing_custom_metrics_file = join(metric_dir, "missing_metrics.py")
    assert_raises(ValueError,
                  register_custom_metric,
                  missing_custom_metrics_file,
                  "f075_macro")
def test_custom_metric_api_experiment():
    """Test API with custom metrics"""

    # register two different metrics from two files
    input_dir = join(_my_dir, "other")
    custom_metrics_file1 = join(input_dir, "custom_metrics.py")
    register_custom_metric(custom_metrics_file1, "f075_macro")
    custom_metrics_file2 = join(input_dir, "custom_metrics2.py")
    register_custom_metric(custom_metrics_file2, "f06_micro")

    # read in some train/test data
    train_file = join(input_dir, "examples_train.jsonlines")
    test_file = join(input_dir, "examples_test.jsonlines")
    train_fs = NDJReader.for_path(train_file).read()
    test_fs = NDJReader.for_path(test_file).read()

    # set up a learner to tune using one of the custom metrics
    # and evaluate it using the other one
    learner = Learner("LogisticRegression")
    _ = learner.train(train_fs, grid_objective="f075_macro")
    results = learner.evaluate(
        test_fs,
        grid_objective="f075_macro",
        output_metrics=["balanced_accuracy", "f06_micro"])
    test_objective_value = results[-2]
    test_output_metrics_dict = results[-1]
    test_accuracy_value = test_output_metrics_dict["balanced_accuracy"]
    test_f06_micro_value = test_output_metrics_dict["f06_micro"]

    # check that the values are as expected
    assert_almost_equal(test_objective_value, 0.9785, places=4)
    assert_almost_equal(test_accuracy_value, 0.9792, places=4)
    assert_almost_equal(test_f06_micro_value, 0.98, places=4)
def test_register_custom_metric_conflicting_metric_name():
    """Test loading custom metric with conflicting name"""

    # try to register a custom metric whose name ("r2") conflicts with
    # a metric that SKLL/scikit-learn already provides; this should be
    # rejected (a NameError is assumed here)
    metric_dir = join(_my_dir, "other")
    custom_metrics_file = join(metric_dir, "custom_metrics.py")
    assert_raises(NameError,
                  register_custom_metric,
                  custom_metrics_file,
                  "r2")
def test_register_custom_metric_bad_extension():
    """Test loading custom metric from non-py file"""

    # try to load a metric from a .txt file instead of a .py file;
    # this should be rejected (a ValueError is assumed here)
    metric_dir = join(_my_dir, "other")
    bad_custom_metrics_file = join(metric_dir, "custom_metrics.txt")
    assert_raises(ValueError,
                  register_custom_metric,
                  bad_custom_metrics_file,
                  "f075_macro")
def test_reregister_same_metric_same_session():
    """Test loading custom metric again in same session"""

    # register a metric from the custom metric file
    metric_dir = join(_my_dir, "other")
    custom_metrics_file = join(metric_dir, "custom_metrics.py")
    register_custom_metric(custom_metrics_file, "f075_macro")

    # re-registering the same metric in the same session should raise
    # an error (a NameError is assumed here)
    assert_raises(NameError,
                  register_custom_metric,
                  custom_metrics_file,
                  "f075_macro")
def test_reregister_same_metric_different_session():
    """Test loading custom metric again in different session"""

    # register a metric from the custom metric file
    metric_dir = join(_my_dir, "other")
    custom_metrics_file = join(metric_dir, "custom_metrics.py")
    register_custom_metric(custom_metrics_file, "f075_macro")

    # clean up any already registered metrics to simulate
    # starting a new Python session
    _cleanup_custom_metrics()

    # now re-registering should work just fine
    register_custom_metric(custom_metrics_file, "f075_macro")
def test_register_custom_metric_load_both():
    """Test loading two custom metrics from one file"""

    # load both metrics from the custom metric file
    metric_dir = join(_my_dir, "other")
    custom_metrics_file = join(metric_dir, "custom_metrics.py")
    register_custom_metric(custom_metrics_file, "f075_macro")
    register_custom_metric(custom_metrics_file, "ratio_of_ones")

    # now make sure that both are registered
    assert "f075_macro" in _CUSTOM_METRICS
    assert "f075_macro" in SCORERS
    assert "ratio_of_ones" in _CUSTOM_METRICS
    assert "ratio_of_ones" in SCORERS
def test_register_custom_metric_load_different_files():
    """Test loading two custom metrics from two files"""

    # load two custom metrics from two different files
    metric_dir = join(_my_dir, "other")
    custom_metrics_file1 = join(metric_dir, "custom_metrics.py")
    custom_metrics_file2 = join(metric_dir, "custom_metrics2.py")
    register_custom_metric(custom_metrics_file1, "f075_macro")
    register_custom_metric(custom_metrics_file2, "f06_micro")

    # make sure both are registered
    assert "f075_macro" in _CUSTOM_METRICS
    assert "f075_macro" in SCORERS
    assert "f06_micro" in _CUSTOM_METRICS
    assert "f06_micro" in SCORERS
def test_register_custom_metric_load_one():
    """Test loading a single custom metric"""

    # load a single metric from the custom metric file
    metric_dir = join(_my_dir, "other")
    custom_metrics_file = join(metric_dir, "custom_metrics.py")
    register_custom_metric(custom_metrics_file, "f075_macro")

    # make sure that this metric is now registered with SKLL
    assert "f075_macro" in _CUSTOM_METRICS
    assert "f075_macro" in SCORERS

    # make sure that the other metric in that same file
    # is _not_ registered with SKLL since we didn't ask for it
    assert "ratio_of_ones" not in _CUSTOM_METRICS
    assert "ratio_of_ones" not in SCORERS
def test_api_with_inverted_custom_metric():
    """Test API with a lower-is-better custom metric"""

    # register a lower-is-better custom metric from our file,
    # which is simply 1 minus the precision score
    input_dir = join(_my_dir, "other")
    custom_metrics_file1 = join(input_dir, "custom_metrics.py")
    register_custom_metric(custom_metrics_file1, "one_minus_precision")

    # create some classification data
    train_fs, _ = make_classification_data(num_examples=1000,
                                            num_features=10,
                                            num_labels=2)

    # set up a learner to tune using the lower-is-better custom metric
    learner1 = Learner("LogisticRegression")
    (grid_score1,
     grid_results_dict1) = learner1.train(train_fs,
                                          grid_objective="one_minus_precision")

    # now set up another learner that uses the complementary version
    # of our custom metric (regular precision) for grid search
    learner2 = Learner("LogisticRegression")
    (grid_score2,
     grid_results_dict2) = learner2.train(train_fs,
                                          grid_objective="precision")

    # for both learners, the ranking of the C hyperparameter should be
    # identical since when we defined one_minus_precision we set the
    # `greater_is_better` keyword argument to `False`
    assert_array_equal(grid_results_dict1['rank_test_score'],
                       grid_results_dict2['rank_test_score'])

    # furthermore, the final grid score and the mean scores for each
    # C hyperparameter value should follow the same 1-X relationship,
    # except that our custom metric's scores are negated due to the
    # keyword argument that we set when we defined it
    assert_almost_equal(1 - grid_score2, -1 * grid_score1, places=6)
    assert_array_almost_equal(1 - grid_results_dict2['mean_test_score'],
                              -1 * grid_results_dict1['mean_test_score'],
                              decimal=6)
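# A sketch of how the lower-is-better "one_minus_precision" metric used
# above could be written in custom_metrics.py: per the test's comments, it
# is simply 1 minus the precision score, and the `greater_is_better=False`
# keyword argument in the signature is what makes SKLL wrap it in a scorer
# whose values are negated during grid search. This is illustrative, not
# the actual fixture.
from sklearn.metrics import precision_score


def one_minus_precision(y_true, y_pred, greater_is_better=False):
    # lower values are better, hence greater_is_better=False
    return 1 - precision_score(y_true, y_pred)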
def test_register_custom_metric_values():
    """Test to check values of custom metrics"""

    # register two metrics from the same file
    metric_dir = join(_my_dir, "other")
    custom_metrics_file = join(metric_dir, "custom_metrics.py")
    register_custom_metric(custom_metrics_file, "f075_macro")
    register_custom_metric(custom_metrics_file, "ratio_of_ones")

    # check that the values that SKLL computes match what we expect
    y_true = [1, 1, 1, 0, 2, 1, 2, 0, 1]
    y_pred = [0, 1, 1, 0, 1, 2, 0, 1, 2]
    skll_value = use_score_func("f075_macro", y_true, y_pred)
    sklearn_value = fbeta_score(y_true, y_pred, beta=0.75, average='macro')
    eq_(skll_value, sklearn_value)

    y_true = [1, 1, 1, 0]
    y_pred = [0, 1, 1, 0]
    skll_value = use_score_func("ratio_of_ones", y_true, y_pred)
    true_ones = len([true for true in y_true if true == 1])
    pred_ones = len([pred for pred in y_pred if pred == 1])
    expected_value = pred_ones / (true_ones + pred_ones)
    eq_(skll_value, expected_value)
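# Based on the expected values checked above, the two metrics loaded from
# custom_metrics.py are presumably defined along these lines (a sketch,
# not the actual fixture file):
from sklearn.metrics import fbeta_score


def f075_macro(y_true, y_pred):
    # macro-averaged F-beta score with beta = 0.75
    return fbeta_score(y_true, y_pred, beta=0.75, average="macro")


def ratio_of_ones(y_true, y_pred):
    # ratio of predicted ones to the total number of true and predicted ones
    true_ones = [label for label in y_true if label == 1]
    pred_ones = [label for label in y_pred if label == 1]
    return len(pred_ones) / (len(true_ones) + len(pred_ones))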
def _classify_featureset(args):
    """
    Classification job to be submitted to grid.

    Parameters
    ----------
    args : dict
        A dictionary with arguments for classifying the
        ``FeatureSet`` instance.

    Returns
    -------
    res : list of dicts
        The results of the classification, in the format
        of a list of dictionaries.

    Raises
    ------
    ValueError
        If extra unknown arguments are passed to the function.
    """

    # Extract all the arguments.
    # (There doesn't seem to be a better way to do this since one can't
    # specify required keyword arguments.)
    experiment_name = args.pop("experiment_name")
    task = args.pop("task")
    sampler = args.pop("sampler")
    feature_hasher = args.pop("feature_hasher")
    hasher_features = args.pop("hasher_features")
    job_name = args.pop("job_name")
    featureset = args.pop("featureset")
    featureset_name = args.pop("featureset_name")
    learner_name = args.pop("learner_name")
    train_path = args.pop("train_path")
    test_path = args.pop("test_path")
    train_set_name = args.pop("train_set_name")
    test_set_name = args.pop("test_set_name")
    shuffle = args.pop('shuffle')
    model_path = args.pop("model_path")
    prediction_prefix = args.pop("prediction_prefix")
    grid_search = args.pop("grid_search")
    grid_objective = args.pop("grid_objective")
    output_metrics = args.pop("output_metrics")
    suffix = args.pop("suffix")
    job_log_file = args.pop("log_file")
    job_log_level = args.pop("log_level")
    probability = args.pop("probability")
    pipeline = args.pop("pipeline")
    results_path = args.pop("results_path")
    fixed_parameters = args.pop("fixed_parameters")
    sampler_parameters = args.pop("sampler_parameters")
    param_grid = args.pop("param_grid")
    pos_label_str = args.pop("pos_label_str")
    overwrite = args.pop("overwrite")
    feature_scaling = args.pop("feature_scaling")
    min_feature_count = args.pop("min_feature_count")
    folds_file = args.pop("folds_file")
    grid_search_jobs = args.pop("grid_search_jobs")
    grid_search_folds = args.pop("grid_search_folds")
    cv_folds = args.pop("cv_folds")
    save_cv_folds = args.pop("save_cv_folds")
    save_cv_models = args.pop("save_cv_models")
    use_folds_file_for_grid_search = args.pop("use_folds_file_for_grid_search")
    stratified_folds = args.pop("do_stratified_folds")
    label_col = args.pop("label_col")
    id_col = args.pop("id_col")
    ids_to_floats = args.pop("ids_to_floats")
    class_map = args.pop("class_map")
    custom_learner_path = args.pop("custom_learner_path")
    custom_metric_path = args.pop("custom_metric_path")
    quiet = args.pop('quiet', False)
    learning_curve_cv_folds = args.pop("learning_curve_cv_folds")
    learning_curve_train_sizes = args.pop("learning_curve_train_sizes")

    if args:
        raise ValueError(("Extra arguments passed to _classify_featureset: "
                          "{}").format(args.keys()))

    start_timestamp = datetime.datetime.now()

    # create a new SKLL logger for this specific job and
    # use the given log level
    logger = get_skll_logger(job_name, job_log_file, log_level=job_log_level)

    try:

        # log messages
        logger.info("Task: {}".format(task))

        # check if we have any possible custom metrics
        possible_custom_metric_names = []
        for metric_name in output_metrics + [grid_objective]:
            # metrics that are not in `SCORERS` and are not `None` are
            # candidates (the `None` is a by-product of how jobs with
            # single tuning objectives are created)
            if metric_name not in SCORERS and metric_name is not None:
                possible_custom_metric_names.append(metric_name)
            # if the metric is already in `SCORERS`, is it a custom one
            # that we already registered? if so, log that
            elif metric_name in _CUSTOM_METRICS:
                logger.info(
                    f"custom metric '{metric_name}' is already registered")

        # initialize list that will hold any invalid metrics
        # that we could not register as custom metrics
        invalid_metric_names = []

        # if we have possible custom metrics
        if possible_custom_metric_names:

            # check that we have a file to load them from
            if not custom_metric_path:
                raise ValueError(
                    f"invalid metrics specified: {possible_custom_metric_names}"
                )
            else:

                # try to register each possible custom metric; if we fail,
                # save the name so that we can raise an error later,
                # otherwise add the custom metric function to `globals()`
                # so that it serializes properly for gridmap
                for custom_metric_name in possible_custom_metric_names:
                    try:
                        custom_metric_func = register_custom_metric(
                            custom_metric_path, custom_metric_name)
                    except (AttributeError, NameError, ValueError):
                        invalid_metric_names.append(custom_metric_name)
                    else:
                        logger.info(f"registered '{custom_metric_name}' as a "
                                    f"custom metric")
                        globals()[custom_metric_name] = custom_metric_func

        # raise an error if we have any invalid metrics
        if invalid_metric_names:
            raise ValueError(
                f"invalid metrics specified: {invalid_metric_names}. "
                f"If these are custom metrics, check the function "
                f"names.")

        if task == 'cross_validate':
            if isinstance(cv_folds, int):
                num_folds = cv_folds
            else:
                # folds_file was used, so count the unique fold ids
                num_folds = len(set(cv_folds.values()))
            logger.info("Cross-validating ({} folds) on {}, feature "
                        "set {} ...".format(num_folds,
                                            train_set_name,
                                            featureset))
        elif task == 'evaluate':
            logger.info("Training on {}, Test on {}, "
                        "feature set {} ...".format(train_set_name,
                                                    test_set_name,
                                                    featureset))
        elif task == 'train':
            logger.info("Training on {}, feature set {} ...".format(
                train_set_name, featureset))
        elif task == 'learning_curve':
            logger.info("Generating learning curve "
                        "({} 80/20 folds, sizes={}, objective={}) on {}, "
                        "feature set {} ...".format(
                            learning_curve_cv_folds,
                            learning_curve_train_sizes,
                            grid_objective,
                            train_set_name,
                            featureset))
        else:  # predict
            logger.info("Training on {}, Making predictions on {}, "
                        "feature set {} ...".format(train_set_name,
                                                    test_set_name,
                                                    featureset))

        # check whether a trained model on the same data with the same
        # featureset already exists; if so, load it and then use it on
        # the test data
        modelfile = join(model_path, '{}.model'.format(job_name))
        if (task in ['cross_validate', 'learning_curve'] or
                not exists(modelfile) or
                overwrite):
            train_examples = load_featureset(train_path,
                                             featureset,
                                             suffix,
                                             label_col=label_col,
                                             id_col=id_col,
                                             ids_to_floats=ids_to_floats,
                                             quiet=quiet,
                                             class_map=class_map,
                                             feature_hasher=feature_hasher,
                                             num_features=hasher_features,
                                             logger=logger)

            train_set_size = len(train_examples.ids)
            if not train_examples.has_labels:
                raise ValueError('Training examples do not have labels')

            # initialize a classifier object
            learner = Learner(learner_name,
                              probability=probability,
                              pipeline=pipeline,
                              feature_scaling=feature_scaling,
                              model_kwargs=fixed_parameters,
                              pos_label_str=pos_label_str,
                              min_feature_count=min_feature_count,
                              sampler=sampler,
                              sampler_kwargs=sampler_parameters,
                              custom_learner_path=custom_learner_path,
                              logger=logger)
        # load the model if it already exists
        else:
            # import custom learner into global namespace if we are reusing
            # a saved model
            if custom_learner_path:
                globals()[learner_name] = load_custom_learner(
                    custom_learner_path, learner_name)
            train_set_size = 'unknown'
            if exists(modelfile) and not overwrite:
                logger.info("Loading pre-existing {} model: {}".format(
                    learner_name, modelfile))
            learner = Learner.from_file(modelfile)

            # attach the job logger to this learner
            learner.logger = logger

        # Load test set if there is one
        if task == 'evaluate' or task == 'predict':
            test_examples = load_featureset(test_path,
                                            featureset,
                                            suffix,
                                            label_col=label_col,
                                            id_col=id_col,
                                            ids_to_floats=ids_to_floats,
                                            quiet=quiet,
                                            class_map=class_map,
                                            feature_hasher=feature_hasher,
                                            num_features=hasher_features)
            test_set_size = len(test_examples.ids)
        else:
            test_set_size = 'n/a'

        # compute information about cross-validation and grid search folds
        # that can be put into the results in readable form
        if isinstance(cv_folds, dict):
            cv_folds_to_print = '{} via folds file'.format(
                len(set(cv_folds.values())))
        else:
            cv_folds_to_print = str(cv_folds)

        if isinstance(grid_search_folds, dict):
            grid_search_folds_to_print = \
                '{} via folds file'.format(len(set(grid_search_folds.values())))
        else:
            grid_search_folds_to_print = str(grid_search_folds)

        # create a list of dictionaries of the results information
        learner_result_dict_base = {
            'experiment_name': experiment_name,
            'train_set_name': train_set_name,
            'train_set_size': train_set_size,
            'test_set_name': test_set_name,
            'test_set_size': test_set_size,
            'featureset': json.dumps(featureset),
            'featureset_name': featureset_name,
            'shuffle': shuffle,
            'learner_name': learner_name,
            'task': task,
            'start_timestamp': start_timestamp.strftime('%d %b %Y %H:%M:'
                                                        '%S.%f'),
            'version': __version__,
            'feature_scaling': feature_scaling,
            'folds_file': folds_file,
            'grid_search': grid_search,
            'grid_objective': grid_objective,
            'grid_search_folds': grid_search_folds_to_print,
            'min_feature_count': min_feature_count,
            'cv_folds': cv_folds_to_print,
            'using_folds_file': (isinstance(cv_folds, dict) or
                                 isinstance(grid_search_folds, dict)),
            'save_cv_folds': save_cv_folds,
            'save_cv_models': save_cv_models,
            'use_folds_file_for_grid_search': use_folds_file_for_grid_search,
            'stratified_folds': stratified_folds,
            'scikit_learn_version': SCIKIT_VERSION
        }

        # check if we're doing cross-validation, because we only load/save
        # models when we're not
        task_results = None
        if task == 'cross_validate':
            logger.info('Cross-validating')
            (task_results,
             grid_scores,
             grid_search_cv_results_dicts,
             skll_fold_ids,
             models) = learner.cross_validate(
                train_examples,
                shuffle=shuffle,
                stratified=stratified_folds,
                prediction_prefix=prediction_prefix,
                grid_search=grid_search,
                grid_search_folds=grid_search_folds,
                cv_folds=cv_folds,
                grid_objective=grid_objective,
                output_metrics=output_metrics,
                param_grid=param_grid,
                grid_jobs=grid_search_jobs,
                save_cv_folds=save_cv_folds,
                save_cv_models=save_cv_models,
                use_custom_folds_for_grid_search=use_folds_file_for_grid_search
            )
            if models:
                for index, m in enumerate(models, start=1):
                    modelfile = join(model_path,
                                     '{}_fold{}.model'.format(job_name, index))
                    m.save(modelfile)
        elif task == 'learning_curve':
            logger.info("Generating learning curve(s)")
            (curve_train_scores,
             curve_test_scores,
             computed_curve_train_sizes) = learner.learning_curve(
                train_examples,
                grid_objective,
                cv_folds=learning_curve_cv_folds,
                train_sizes=learning_curve_train_sizes)
        else:
            # if we do not have a saved model, we need to train one
            if not exists(modelfile) or overwrite:
                logger.info("Featurizing and training new {} model".format(
                    learner_name))

                (best_score,
                 grid_search_cv_results) = learner.train(
                    train_examples,
                    shuffle=shuffle,
                    grid_search=grid_search,
                    grid_search_folds=grid_search_folds,
                    grid_objective=grid_objective,
                    param_grid=param_grid,
                    grid_jobs=grid_search_jobs)
                grid_scores = [best_score]
                grid_search_cv_results_dicts = [grid_search_cv_results]

                # save model
                if model_path:
                    learner.save(modelfile)

                if grid_search:
                    logger.info("Best {} grid search score: {}".format(
                        grid_objective, round(best_score, 3)))
            else:
                grid_scores = [None]
                grid_search_cv_results_dicts = [None]

            # print out the parameters
            param_out = ('{}: {}'.format(param_name, param_value)
                         for param_name, param_value in
                         learner.model.get_params().items())
            logger.info("Hyperparameters: {}".format(', '.join(param_out)))

            # run on test set or cross-validate on training data,
            # depending on what was asked for
            if task == 'evaluate':
                logger.info("Evaluating predictions")
                task_results = [
                    learner.evaluate(test_examples,
                                     prediction_prefix=prediction_prefix,
                                     grid_objective=grid_objective,
                                     output_metrics=output_metrics)
                ]
            elif task == 'predict':
                logger.info("Writing predictions")
                # we set `class_labels` to `False` so that if the learner is
                # probabilistic, probabilities are written instead of labels
                learner.predict(test_examples,
                                prediction_prefix=prediction_prefix,
                                class_labels=False)
            # do nothing here for train

        end_timestamp = datetime.datetime.now()
        learner_result_dict_base['end_timestamp'] = end_timestamp.strftime(
            '%d %b %Y %H:%M:%S.%f')
        total_time = end_timestamp - start_timestamp
        learner_result_dict_base['total_time'] = str(total_time)

        if task == 'cross_validate' or task == 'evaluate':
            results_json_path = join(results_path,
                                     '{}.results.json'.format(job_name))

            res = _create_learner_result_dicts(task_results,
                                               grid_scores,
                                               grid_search_cv_results_dicts,
                                               learner_result_dict_base)

            # write out the result dictionary to a json file
            with open(results_json_path, 'w') as json_file:
                json.dump(res, json_file, cls=NumpyTypeEncoder)

            with open(join(results_path, '{}.results'.format(job_name)),
                      'w') as output_file:
                _print_fancy_output(res, output_file)

        elif task == 'learning_curve':
            results_json_path = join(results_path,
                                     '{}.results.json'.format(job_name))

            res = {}
            res.update(learner_result_dict_base)
            res.update({
                'learning_curve_cv_folds': learning_curve_cv_folds,
                'given_curve_train_sizes': learning_curve_train_sizes,
                'learning_curve_train_scores_means': np.mean(curve_train_scores,
                                                             axis=1),
                'learning_curve_test_scores_means': np.mean(curve_test_scores,
                                                            axis=1),
                'learning_curve_train_scores_stds': np.std(curve_train_scores,
                                                           axis=1,
                                                           ddof=1),
                'learning_curve_test_scores_stds': np.std(curve_test_scores,
                                                          axis=1,
                                                          ddof=1),
                'computed_curve_train_sizes': computed_curve_train_sizes
            })

            # we need to return and write out a list of dictionaries
            res = [res]

            # write out the result dictionary to a json file
            with open(results_json_path, 'w') as json_file:
                json.dump(res, json_file, cls=NumpyTypeEncoder)

        # for all other tasks, i.e. train or predict
        else:
            if results_path:
                results_json_path = join(results_path,
                                         '{}.results.json'.format(job_name))

                assert len(grid_scores) == 1
                assert len(grid_search_cv_results_dicts) == 1
                grid_search_cv_results_dict = {"grid_score": grid_scores[0]}
                grid_search_cv_results_dict["grid_search_cv_results"] = \
                    grid_search_cv_results_dicts[0]
                grid_search_cv_results_dict.update(learner_result_dict_base)
                # write out the result dictionary to a json file
                with open(results_json_path, 'w') as json_file:
                    json.dump(grid_search_cv_results_dict, json_file,
                              cls=NumpyTypeEncoder)
            res = [learner_result_dict_base]

        # write out the cv folds if required
        if task == 'cross_validate' and save_cv_folds:
            skll_fold_ids_file = experiment_name + '_skll_fold_ids.csv'
            with open(join(results_path, skll_fold_ids_file),
                      'w') as output_file:
                _write_skll_folds(skll_fold_ids, output_file)

    finally:
        close_and_remove_logger_handlers(logger)

    return res
def test_register_custom_metric_missing_name():
    """Test loading custom metric from empty string"""

    # try to load a metric from an empty file path, which can happen
    # via a bad configuration file; this should be rejected
    # (a ValueError is assumed here)
    assert_raises(ValueError,
                  register_custom_metric,
                  "",
                  "f075_macro")