def check_train_and_score_function(model_type):
    """
    Check that the ``_train_and_score()`` function works as expected.
    """
    # create train and test data
    (train_fs,
     test_fs) = make_classification_data(num_examples=500,
                                         train_test_ratio=0.7,
                                         num_features=5,
                                         use_feature_hashing=False,
                                         non_negative=True)

    # pick the estimator and the metric appropriate for the model type
    if model_type == 'classifier':
        estimator_name = 'LogisticRegression'
        metric = 'accuracy'
    else:
        estimator_name = 'Ridge'
        metric = 'pearson'

    # call _train_and_score() on this data
    learner1 = Learner(estimator_name)
    train_score1, test_score1 = _train_and_score(learner1,
                                                 train_fs,
                                                 test_fs,
                                                 metric)

    # this should yield identical results when training another instance
    # of the same learner without grid search and shuffling and evaluating
    # that instance on the train and the test set
    learner2 = Learner(estimator_name)
    learner2.train(train_fs, grid_search=False, shuffle=False)
    train_score2 = learner2.evaluate(train_fs,
                                     output_metrics=[metric])[-1][metric]
    test_score2 = learner2.evaluate(test_fs,
                                    output_metrics=[metric])[-1][metric]

    eq_(train_score1, train_score2)
    eq_(test_score1, test_score2)
def check_invalid_regression_metric(learner, metric, by_itself=False):
    """
    Check that invalid regression metrics raise exceptions.
    """
    (train_fs, test_fs, _) = make_regression_data()
    clf = Learner(learner)
    clf.train(train_fs, grid_search=False)

    # evaluate with the invalid metric either on its own or
    # together with a valid one; either way should raise
    if by_itself:
        output_metrics = [metric]
    else:
        output_metrics = ['pearson', metric]
    clf.evaluate(test_fs, output_metrics=output_metrics)
def check_scaling_features(use_feature_hashing=False, use_scaling=False):
    """Check F-measures from an SGDClassifier with/without feature scaling."""
    train_fs, test_fs = make_scaling_data(
        use_feature_hashing=use_feature_hashing)

    # create a linear classifier with the requested scaling setting
    feature_scaling = 'both' if use_scaling else 'none'
    learner = Learner('SGDClassifier',
                      feature_scaling=feature_scaling,
                      pos_label_str=1)

    # train on the training set and evaluate on the test set
    learner.train(train_fs)
    test_output = learner.evaluate(test_fs)
    fmeasures = [test_output[2][0]['F-measure'],
                 test_output[2][1]['F-measure']]

    # these are the expected values of the f-measures, sorted
    if not use_feature_hashing:
        if not use_scaling:
            expected_fmeasures = [0.7979797979797979, 0.80198019801980192]
        else:
            expected_fmeasures = [0.94883720930232551, 0.94054054054054048]
    else:
        if not use_scaling:
            expected_fmeasures = [0.83962264150943389, 0.81914893617021278]
        else:
            expected_fmeasures = [0.88038277511961716, 0.86910994764397898]

    assert_almost_equal(expected_fmeasures, fmeasures)
def check_scaling_features(use_feature_hashing=False, use_scaling=False):
    """Check F-measures from an SGDClassifier with/without feature scaling."""
    train_fs, test_fs = make_scaling_data(
        use_feature_hashing=use_feature_hashing)

    # create a linear classifier with the requested scaling setting
    feature_scaling = 'both' if use_scaling else 'none'
    learner = Learner('SGDClassifier',
                      feature_scaling=feature_scaling,
                      pos_label_str=1)

    # train on the training set and evaluate on the test set
    learner.train(train_fs)
    test_output = learner.evaluate(test_fs)
    fmeasures = [test_output[2][0]['F-measure'],
                 test_output[2][1]['F-measure']]

    # these are the expected values of the f-measures, sorted
    if not use_feature_hashing:
        if not use_scaling:
            expected_fmeasures = [0.77319587628865982, 0.78640776699029125]
        else:
            expected_fmeasures = [0.94930875576036866, 0.93989071038251359]
    else:
        if not use_scaling:
            expected_fmeasures = [0.42774566473988435, 0.5638766519823788]
        else:
            expected_fmeasures = [0.87323943661971837, 0.85561497326203206]

    assert_almost_equal(expected_fmeasures, fmeasures)
def check_scaling_features(use_feature_hashing=False, use_scaling=False):
    """Check F-measures from a tuned SGDClassifier with/without scaling."""
    train_fs, test_fs = make_scaling_data(
        use_feature_hashing=use_feature_hashing)

    # create a linear classifier with the requested scaling setting
    feature_scaling = 'both' if use_scaling else 'none'
    learner = Learner('SGDClassifier',
                      feature_scaling=feature_scaling,
                      pos_label_str=1)

    # train with grid search on the training set and evaluate on the test set
    learner.train(train_fs, grid_search=True, grid_objective='f1_score_micro')
    test_output = learner.evaluate(test_fs)
    fmeasures = [test_output[2][0]['F-measure'],
                 test_output[2][1]['F-measure']]

    # these are the expected values of the f-measures, sorted
    if not use_feature_hashing:
        if not use_scaling:
            expected_fmeasures = [0.55276381909547745, 0.55721393034825872]
        else:
            expected_fmeasures = [0.65217391304347827, 0.70370370370370372]
    else:
        if not use_scaling:
            expected_fmeasures = [0.54255319148936176, 0.59433962264150941]
        else:
            expected_fmeasures = [0.69950738916256161, 0.69035532994923865]

    assert_almost_equal(expected_fmeasures, fmeasures)
def check_scaling_features(use_feature_hashing=False, use_scaling=False):
    """Check F-measures from an SGDClassifier with/without feature scaling."""
    train_fs, test_fs = make_scaling_data(
        use_feature_hashing=use_feature_hashing)

    # create a linear classifier with the requested scaling setting
    feature_scaling = 'both' if use_scaling else 'none'
    learner = Learner('SGDClassifier',
                      feature_scaling=feature_scaling,
                      pos_label_str=1)

    # train on the training set and evaluate on the test set
    learner.train(train_fs)
    test_output = learner.evaluate(test_fs)
    fmeasures = [test_output[2][0]['F-measure'],
                 test_output[2][1]['F-measure']]

    # these are the expected values of the f-measures, sorted
    if not use_feature_hashing:
        if not use_scaling:
            expected_fmeasures = [0.55276381909547745, 0.55721393034825872]
        else:
            expected_fmeasures = [0.65217391304347827, 0.70370370370370372]
    else:
        if not use_scaling:
            expected_fmeasures = [0.54255319148936176, 0.59433962264150941]
        else:
            expected_fmeasures = [0.69950738916256161, 0.69035532994923865]

    assert_almost_equal(expected_fmeasures, fmeasures)
def check_scaling_features(use_feature_hashing=False, use_scaling=False):
    """Check F-measures from a tuned SGDClassifier with/without scaling."""
    train_fs, test_fs = make_scaling_data(
        use_feature_hashing=use_feature_hashing)

    # create a linear classifier with the requested scaling setting
    feature_scaling = 'both' if use_scaling else 'none'
    learner = Learner('SGDClassifier',
                      feature_scaling=feature_scaling,
                      pos_label_str=1)

    # train with grid search on the training set and evaluate on the test set
    learner.train(train_fs, grid_search=True, grid_objective='f1_score_micro')
    test_output = learner.evaluate(test_fs)
    fmeasures = [test_output[2][0]['F-measure'],
                 test_output[2][1]['F-measure']]

    # these are the expected values of the f-measures, sorted
    if not use_feature_hashing:
        if not use_scaling:
            expected_fmeasures = [0.6699507389162562, 0.6598984771573605]
        else:
            expected_fmeasures = [0.7058823529411765, 0.7417840375586855]
    else:
        if not use_scaling:
            expected_fmeasures = [0.5288461538461539, 0.4895833333333333]
        else:
            expected_fmeasures = [0.632183908045977, 0.7168141592920354]

    assert_almost_equal(expected_fmeasures, fmeasures)
def check_scaling_features(use_feature_hashing=False, use_scaling=False):
    """Check F-measures from a tuned SGDClassifier with/without scaling."""
    train_fs, test_fs = make_scaling_data(
        use_feature_hashing=use_feature_hashing)

    # create a linear classifier with the requested scaling setting
    feature_scaling = 'both' if use_scaling else 'none'
    learner = Learner('SGDClassifier',
                      feature_scaling=feature_scaling,
                      pos_label_str=1)

    # train with grid search on the training set and evaluate on the test set
    learner.train(train_fs, grid_search=True, grid_objective='f1_score_micro')
    test_output = learner.evaluate(test_fs)
    fmeasures = [test_output[2][0]['F-measure'],
                 test_output[2][1]['F-measure']]

    # these are the expected values of the f-measures, sorted
    if not use_feature_hashing:
        if not use_scaling:
            expected_fmeasures = [0.5333333333333333, 0.4842105263157895]
        else:
            expected_fmeasures = [0.7219512195121951, 0.7076923076923077]
    else:
        if not use_scaling:
            expected_fmeasures = [0.5288461538461539, 0.4895833333333333]
        else:
            expected_fmeasures = [0.663157894736842, 0.6952380952380952]

    assert_almost_equal(expected_fmeasures, fmeasures)
def check_sparse_predict(learner_name, expected_score,
                         use_feature_hashing=False):
    """Train the given classifier on sparse data and check its test score."""
    train_fs, test_fs = make_sparse_data(
        use_feature_hashing=use_feature_hashing)

    # train the given classifier on the training data
    # and evaluate on the testing data
    learner = Learner(learner_name)
    learner.train(train_fs, grid_search=False)
    test_score = learner.evaluate(test_fs)[1]
    assert_almost_equal(test_score, expected_score)
def check_sparse_predict(use_feature_hashing=False):
    """Train logistic regression on sparse data and check its test score."""
    train_fs, test_fs = make_sparse_data(
        use_feature_hashing=use_feature_hashing)

    # train a logistic regression model on the training data
    # and evaluate on the testing data
    learner = Learner('LogisticRegression')
    learner.train(train_fs, grid_search=False)
    test_score = learner.evaluate(test_fs)[1]

    expected_score = 0.51 if use_feature_hashing else 0.45
    assert_almost_equal(test_score, expected_score)
def check_adaboost_predict(base_estimator, algorithm, expected_score):
    """Train an AdaBoost classifier on sparse data and check its test score."""
    train_fs, test_fs = make_sparse_data()

    # train an AdaBoostClassifier on the training data
    # and evaluate on the testing data
    learner = Learner('AdaBoostClassifier',
                      model_kwargs={'base_estimator': base_estimator,
                                    'algorithm': algorithm})
    learner.train(train_fs, grid_search=False)
    test_score = learner.evaluate(test_fs)[1]
    assert_almost_equal(test_score, expected_score)
def test_new_labels_in_test_set():
    """
    Test classification experiment with an unseen label in the test set.
    """
    train_fs, test_fs = make_classification_data(num_labels=3,
                                                 train_test_ratio=0.8)
    # add new labels to the test set
    test_fs.labels[-3:] = 3

    learner = Learner('SVC')
    learner.train(train_fs, grid_search=False)
    res = learner.evaluate(test_fs)

    yield check_results_with_unseen_labels, res, 4, [3]
    yield assert_almost_equal, res[1], 0.3
def test_all_new_labels_in_test():
    """
    Test classification with all labels in test set unseen.
    """
    train_fs, test_fs = make_classification_data(num_labels=3,
                                                 train_test_ratio=0.8)
    # shift every test label out of the training label range
    test_fs.labels = test_fs.labels + 3

    learner = Learner('SVC')
    learner.train(train_fs, grid_search=False)
    res = learner.evaluate(test_fs)

    yield check_results_with_unseen_labels, res, 6, [3, 4, 5]
    yield assert_almost_equal, res[1], 0
def test_all_new_labels_in_test():
    """
    Test classification with all labels in test set unseen.
    """
    train_fs, test_fs = make_classification_data(num_labels=3,
                                                 train_test_ratio=0.8)
    # shift every test label out of the training label range
    test_fs.labels = test_fs.labels + 3

    learner = Learner('SVC')
    learner.train(train_fs, grid_search=False)
    res = learner.evaluate(test_fs)

    yield check_results_with_unseen_labels, res, 6, [3, 4, 5]
    yield assert_almost_equal, res[1], 0
def test_new_labels_in_test_set_change_order():
    """
    Test classification with an unseen label in the test set when the
    new label falls between the existing labels.
    """
    train_fs, test_fs = make_classification_data(num_labels=3,
                                                 train_test_ratio=0.8)
    # spread out the train labels to create a gap
    train_fs.labels = train_fs.labels * 10
    # spread out the test labels the same way, then insert a new label
    # that falls inside the gap
    test_fs.labels = test_fs.labels * 10
    test_fs.labels[-3:] = 15

    learner = Learner('SVC')
    learner.train(train_fs, grid_search=False)
    res = learner.evaluate(test_fs)

    yield check_results_with_unseen_labels, res, 4, [15]
    yield assert_almost_equal, res[1], 0.3
def test_new_labels_in_test_set_change_order():
    """
    Test classification with an unseen label in the test set when the
    new label falls between the existing labels.
    """
    train_fs, test_fs = make_classification_data(num_labels=3,
                                                 train_test_ratio=0.8)
    # spread out the train labels to create a gap
    train_fs.labels = train_fs.labels * 10
    # spread out the test labels the same way, then insert a new label
    # that falls inside the gap
    test_fs.labels = test_fs.labels * 10
    test_fs.labels[-3:] = 15

    learner = Learner('SVC')
    learner.train(train_fs, grid_search=False)
    res = learner.evaluate(test_fs)

    yield check_results_with_unseen_labels, res, 4, [15]
    yield assert_almost_equal, res[1], 0.3
def test_additional_metrics():
    """
    Test additional metrics in the results file for a regressor.
    """
    train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                num_features=3)

    # train a regression model using the train feature set
    learner = Learner('LinearRegression')
    learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # evaluate the trained model using the test feature set,
    # requesting two extra correlation metrics
    results = learner.evaluate(test_fs,
                               output_metrics=['spearman', 'kendall_tau'])

    # check that the values for the additional metrics are as expected
    additional_scores_dict = results[-1]
    assert_almost_equal(additional_scores_dict['spearman'], 0.9996, places=4)
    assert_almost_equal(additional_scores_dict['kendall_tau'], 0.9846,
                        places=4)
def test_additional_metrics():
    """
    Test additional metrics in the results file for a regressor.
    """
    train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                num_features=3)

    # train a regression model using the train feature set
    learner = Learner('LinearRegression')
    learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # evaluate the trained model using the test feature set,
    # requesting two extra correlation metrics
    results = learner.evaluate(test_fs,
                               output_metrics=['spearman', 'kendall_tau'])

    # check that the values for the additional metrics are as expected
    additional_scores_dict = results[-1]
    assert_almost_equal(additional_scores_dict['spearman'], 0.9996, places=4)
    assert_almost_equal(additional_scores_dict['kendall_tau'], 0.9847,
                        places=4)
def check_sparse_predict_sampler(use_feature_hashing=False):
    """Check logistic regression with a kernel-approximation sampler."""
    train_fs, test_fs = make_sparse_data(
        use_feature_hashing=use_feature_hashing)

    # choose the sampler and its settings based on the hashing flag
    if use_feature_hashing:
        sampler = 'RBFSampler'
        sampler_parameters = {"gamma": 1.0, "n_components": 50}
    else:
        sampler = 'Nystroem'
        sampler_parameters = {"gamma": 1.0,
                              "n_components": 50,
                              "kernel": 'rbf'}

    learner = Learner('LogisticRegression',
                      sampler=sampler,
                      sampler_kwargs=sampler_parameters)
    learner.train(train_fs, grid_search=False)
    test_score = learner.evaluate(test_fs)[1]

    expected_score = 0.48 if use_feature_hashing else 0.45
    assert_almost_equal(test_score, expected_score)
def test_fancy_output():
    """
    Test the descriptive statistics output in the results file
    for a regressor.
    """
    train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                num_features=3)

    # train a regression model using the train feature set
    learner = Learner('LinearRegression')
    learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # evaluate the trained model using the test feature set and pull
    # out the descriptive statistics computed via the API
    resultdict = learner.evaluate(test_fs)
    actual_stats_from_api = dict(resultdict[2]['descriptive']['actual'])
    pred_stats_from_api = dict(resultdict[2]['descriptive']['predicted'])

    # write out the training and test feature set
    train_dir = join(_my_dir, 'train')
    test_dir = join(_my_dir, 'test')
    output_dir = join(_my_dir, 'output')
    train_writer = NDJWriter(join(train_dir, 'fancy_train.jsonlines'),
                             train_fs)
    train_writer.write()
    test_writer = NDJWriter(join(test_dir, 'fancy_test.jsonlines'), test_fs)
    test_writer.write()

    # now get the config file template, fill it in and run it
    # so that we can get a results file
    config_template_path = join(_my_dir, 'configs',
                                'test_regression_fancy_output.template.cfg')
    config_path = fill_in_config_paths_for_fancy_output(config_template_path)
    run_configuration(config_path, quiet=True)

    # read in the results file and parse out the descriptive statistics
    actual_stats_from_file = {}
    pred_stats_from_file = {}
    with open(join(output_dir, ('regression_fancy_output_train_fancy_train.'
                                'jsonlines_test_fancy_test.jsonlines'
                                '_LinearRegression.results')),
              'r') as resultf:
        result_output = resultf.read().strip().split('\n')
        for desc_stat_line in result_output[26:30]:
            desc_stat_line = desc_stat_line.strip()
            if not desc_stat_line:
                continue
            # each line looks like "Mean = X (actual), Y (predicted)"
            m = re.search(r'([A-Za-z]+)\s+=\s+(-?[0-9]+.?[0-9]*)\s+'
                          r'\((actual)\),\s+(-?[0-9]+.?[0-9]*)\s+'
                          r'\((predicted)\)', desc_stat_line)
            stat_type, actual_value, _, pred_value, _ = m.groups()
            actual_stats_from_file[stat_type.lower()] = float(actual_value)
            pred_stats_from_file[stat_type.lower()] = float(pred_value)

    # file statistics must match the ones computed via the API
    for stat_type in actual_stats_from_api:
        assert_almost_equal(actual_stats_from_file[stat_type],
                            actual_stats_from_api[stat_type],
                            places=4)
        assert_almost_equal(pred_stats_from_file[stat_type],
                            pred_stats_from_api[stat_type],
                            places=4)
def test_fancy_output():
    """
    Test the descriptive statistics output in the results file
    for a regressor.
    """
    train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                num_features=3)

    # train a regression model using the train feature set;
    # NOTE: `grid_search=True` is passed explicitly so that the
    # `grid_objective` is actually used for tuning (a grid objective
    # without grid search is ignored), matching the sibling version
    # of this test elsewhere in the file
    learner = Learner('LinearRegression')
    learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # evaluate the trained model using the test feature set and pull
    # out the descriptive statistics computed via the API
    resultdict = learner.evaluate(test_fs)
    actual_stats_from_api = dict(resultdict[2]['descriptive']['actual'])
    pred_stats_from_api = dict(resultdict[2]['descriptive']['predicted'])

    # write out the training and test feature set
    train_dir = join(_my_dir, 'train')
    test_dir = join(_my_dir, 'test')
    output_dir = join(_my_dir, 'output')
    train_writer = NDJWriter(join(train_dir, 'fancy_train.jsonlines'),
                             train_fs)
    train_writer.write()
    test_writer = NDJWriter(join(test_dir, 'fancy_test.jsonlines'), test_fs)
    test_writer.write()

    # now get the config file template, fill it in and run it
    # so that we can get a results file
    config_template_path = join(_my_dir, 'configs',
                                'test_regression_fancy_output.template.cfg')
    config_path = fill_in_config_paths_for_fancy_output(config_template_path)
    run_configuration(config_path, quiet=True)

    # read in the results file and parse out the descriptive statistics
    actual_stats_from_file = {}
    pred_stats_from_file = {}
    with open(join(output_dir, ('regression_fancy_output_train_fancy_train.'
                                'jsonlines_test_fancy_test.jsonlines'
                                '_LinearRegression.results')),
              'r') as resultf:
        result_output = resultf.read().strip().split('\n')
        for desc_stat_line in result_output[27:31]:
            desc_stat_line = desc_stat_line.strip()
            if not desc_stat_line:
                continue
            # each line looks like "Mean = X (actual), Y (predicted)"
            m = re.search(r'([A-Za-z]+)\s+=\s+(-?[0-9]+.?[0-9]*)\s+'
                          r'\((actual)\),\s+(-?[0-9]+.?[0-9]*)\s+'
                          r'\((predicted)\)', desc_stat_line)
            stat_type, actual_value, _, pred_value, _ = m.groups()
            actual_stats_from_file[stat_type.lower()] = float(actual_value)
            pred_stats_from_file[stat_type.lower()] = float(pred_value)

    # file statistics must match the ones computed via the API
    for stat_type in actual_stats_from_api:
        assert_almost_equal(actual_stats_from_file[stat_type],
                            actual_stats_from_api[stat_type],
                            places=4)
        assert_almost_equal(pred_stats_from_file[stat_type],
                            pred_stats_from_api[stat_type],
                            places=4)
def _classify_featureset(args):
    """
    Classification job to be submitted to grid.

    Parameters
    ----------
    args : dict
        A dictionary with arguments for classifying the
        ``FeatureSet`` instance.

    Returns
    -------
    res : list of dicts
        The results of the classification, in the format
        of a list of dictionaries.

    Raises
    ------
    ValueError
        If extra unknown arguments are passed to the function.
    """
    # Extract all the arguments.
    # (There doesn't seem to be a better way to do this since one can't
    # specify required keyword arguments.)
    experiment_name = args.pop("experiment_name")
    task = args.pop("task")
    sampler = args.pop("sampler")
    feature_hasher = args.pop("feature_hasher")
    hasher_features = args.pop("hasher_features")
    job_name = args.pop("job_name")
    featureset = args.pop("featureset")
    featureset_name = args.pop("featureset_name")
    learner_name = args.pop("learner_name")
    train_path = args.pop("train_path")
    test_path = args.pop("test_path")
    train_set_name = args.pop("train_set_name")
    test_set_name = args.pop("test_set_name")
    shuffle = args.pop('shuffle')
    model_path = args.pop("model_path")
    prediction_prefix = args.pop("prediction_prefix")
    grid_search = args.pop("grid_search")
    grid_objective = args.pop("grid_objective")
    output_metrics = args.pop("output_metrics")
    suffix = args.pop("suffix")
    job_log_file = args.pop("log_file")
    job_log_level = args.pop("log_level")
    probability = args.pop("probability")
    pipeline = args.pop("pipeline")
    results_path = args.pop("results_path")
    fixed_parameters = args.pop("fixed_parameters")
    sampler_parameters = args.pop("sampler_parameters")
    param_grid = args.pop("param_grid")
    pos_label_str = args.pop("pos_label_str")
    overwrite = args.pop("overwrite")
    feature_scaling = args.pop("feature_scaling")
    min_feature_count = args.pop("min_feature_count")
    folds_file = args.pop("folds_file")
    grid_search_jobs = args.pop("grid_search_jobs")
    grid_search_folds = args.pop("grid_search_folds")
    cv_folds = args.pop("cv_folds")
    save_cv_folds = args.pop("save_cv_folds")
    save_cv_models = args.pop("save_cv_models")
    use_folds_file_for_grid_search = args.pop("use_folds_file_for_grid_search")
    stratified_folds = args.pop("do_stratified_folds")
    label_col = args.pop("label_col")
    id_col = args.pop("id_col")
    ids_to_floats = args.pop("ids_to_floats")
    class_map = args.pop("class_map")
    custom_learner_path = args.pop("custom_learner_path")
    custom_metric_path = args.pop("custom_metric_path")
    quiet = args.pop('quiet', False)
    learning_curve_cv_folds = args.pop("learning_curve_cv_folds")
    learning_curve_train_sizes = args.pop("learning_curve_train_sizes")

    if args:
        raise ValueError(("Extra arguments passed to _classify_featureset: "
                          "{}").format(args.keys()))
    start_timestamp = datetime.datetime.now()

    # create a new SKLL logger for this specific job and
    # use the given log level
    logger = get_skll_logger(job_name, job_log_file, log_level=job_log_level)

    try:

        # log messages
        logger.info("Task: {}".format(task))

        # check if we have any possible custom metrics
        possible_custom_metric_names = []
        for metric_name in output_metrics + [grid_objective]:
            # metrics that are not in `SCORERS` or `None` are candidates
            # (the `None` is a by-product of how jobs with single tuning
            # objectives are created)
            if metric_name not in SCORERS and metric_name is not None:
                possible_custom_metric_names.append(metric_name)
            # if the metric is already in `SCORERS`, is it a custom one
            # that we already registered? if so, log that
            elif metric_name in _CUSTOM_METRICS:
                logger.info(
                    f"custom metric '{metric_name}' is already registered")

        # initialize list that will hold any invalid metrics
        # that we could not register as custom metrics
        invalid_metric_names = []

        # if we have possible custom metrics
        if possible_custom_metric_names:

            # check that we have a file to load them from
            if not custom_metric_path:
                raise ValueError(
                    f"invalid metrics specified: {possible_custom_metric_names}"
                )
            else:
                # try to register each possible custom metric
                # raise an exception if we fail, if we don't then
                # add the custom metric function to `globals()` so
                # that it serializes properly for gridmap
                for custom_metric_name in possible_custom_metric_names:
                    try:
                        custom_metric_func = register_custom_metric(
                            custom_metric_path, custom_metric_name)
                    except (AttributeError, NameError, ValueError):
                        invalid_metric_names.append(custom_metric_name)
                    else:
                        logger.info(f"registered '{custom_metric_name}' as a "
                                    f"custom metric")
                        globals()[custom_metric_name] = custom_metric_func

        # raise an error if we have any invalid metrics
        if invalid_metric_names:
            raise ValueError(
                f"invalid metrics specified: {invalid_metric_names}. "
                f"If these are custom metrics, check the function "
                f"names.")

        if task == 'cross_validate':
            if isinstance(cv_folds, int):
                num_folds = cv_folds
            else:
                # folds_file was used, so count the unique fold ids.
                num_folds = len(set(cv_folds.values()))
            logger.info("Cross-validating ({} folds) on {}, feature "
                        "set {} ...".format(num_folds,
                                            train_set_name,
                                            featureset))
        elif task == 'evaluate':
            logger.info("Training on {}, Test on {}, "
                        "feature set {} ...".format(train_set_name,
                                                    test_set_name,
                                                    featureset))
        elif task == 'train':
            logger.info("Training on {}, feature set {} ...".format(
                train_set_name, featureset))
        elif task == 'learning_curve':
            logger.info("Generating learning curve "
                        "({} 80/20 folds, sizes={}, objective={}) on {}, "
                        "feature set {} ...".format(
                            learning_curve_cv_folds,
                            learning_curve_train_sizes,
                            grid_objective,
                            train_set_name,
                            featureset))
        else:  # predict
            logger.info("Training on {}, Making predictions on {}, "
                        "feature set {} ...".format(train_set_name,
                                                    test_set_name,
                                                    featureset))

        # check whether a trained model on the same data with the same
        # featureset already exists if so, load it and then use it on test data
        modelfile = join(model_path, '{}.model'.format(job_name))
        if (task in ['cross_validate', 'learning_curve'] or
                not exists(modelfile) or
                overwrite):
            train_examples = load_featureset(train_path,
                                             featureset,
                                             suffix,
                                             label_col=label_col,
                                             id_col=id_col,
                                             ids_to_floats=ids_to_floats,
                                             quiet=quiet,
                                             class_map=class_map,
                                             feature_hasher=feature_hasher,
                                             num_features=hasher_features,
                                             logger=logger)

            train_set_size = len(train_examples.ids)
            if not train_examples.has_labels:
                raise ValueError('Training examples do not have labels')
            # initialize a classifer object
            learner = Learner(learner_name,
                              probability=probability,
                              pipeline=pipeline,
                              feature_scaling=feature_scaling,
                              model_kwargs=fixed_parameters,
                              pos_label_str=pos_label_str,
                              min_feature_count=min_feature_count,
                              sampler=sampler,
                              sampler_kwargs=sampler_parameters,
                              custom_learner_path=custom_learner_path,
                              logger=logger)
        # load the model if it already exists
        else:
            # import custom learner into global namespace if we are reusing
            # a saved model
            if custom_learner_path:
                globals()[learner_name] = load_custom_learner(
                    custom_learner_path, learner_name)
            train_set_size = 'unknown'
            if exists(modelfile) and not overwrite:
                logger.info("Loading pre-existing {} model: {}".format(
                    learner_name, modelfile))
            learner = Learner.from_file(modelfile)

            # attach the job logger to this learner
            learner.logger = logger

        # Load test set if there is one
        if task == 'evaluate' or task == 'predict':
            test_examples = load_featureset(test_path,
                                            featureset,
                                            suffix,
                                            label_col=label_col,
                                            id_col=id_col,
                                            ids_to_floats=ids_to_floats,
                                            quiet=quiet,
                                            class_map=class_map,
                                            feature_hasher=feature_hasher,
                                            num_features=hasher_features)
            test_set_size = len(test_examples.ids)
        else:
            test_set_size = 'n/a'

        # compute information about xval and grid folds that can be put in
        # results in readable form
        if isinstance(cv_folds, dict):
            cv_folds_to_print = '{} via folds file'.format(
                len(set(cv_folds.values())))
        else:
            cv_folds_to_print = str(cv_folds)

        if isinstance(grid_search_folds, dict):
            grid_search_folds_to_print = \
                '{} via folds file'.format(len(set(grid_search_folds.values())))
        else:
            grid_search_folds_to_print = str(grid_search_folds)

        # create a list of dictionaries of the results information
        learner_result_dict_base = {
            'experiment_name': experiment_name,
            'train_set_name': train_set_name,
            'train_set_size': train_set_size,
            'test_set_name': test_set_name,
            'test_set_size': test_set_size,
            'featureset': json.dumps(featureset),
            'featureset_name': featureset_name,
            'shuffle': shuffle,
            'learner_name': learner_name,
            'task': task,
            'start_timestamp': start_timestamp.strftime('%d %b %Y %H:%M:'
                                                        '%S.%f'),
            'version': __version__,
            'feature_scaling': feature_scaling,
            'folds_file': folds_file,
            'grid_search': grid_search,
            'grid_objective': grid_objective,
            'grid_search_folds': grid_search_folds_to_print,
            'min_feature_count': min_feature_count,
            'cv_folds': cv_folds_to_print,
            'using_folds_file': isinstance(cv_folds, dict) or
                                isinstance(grid_search_folds, dict),
            'save_cv_folds': save_cv_folds,
            'save_cv_models': save_cv_models,
            'use_folds_file_for_grid_search': use_folds_file_for_grid_search,
            'stratified_folds': stratified_folds,
            'scikit_learn_version': SCIKIT_VERSION
        }

        # check if we're doing cross-validation, because we only load/save
        # models when we're not.
        task_results = None
        if task == 'cross_validate':
            logger.info('Cross-validating')
            (
                task_results,
                grid_scores,
                grid_search_cv_results_dicts,
                skll_fold_ids,
                models
            ) = learner.cross_validate(
                train_examples,
                shuffle=shuffle,
                stratified=stratified_folds,
                prediction_prefix=prediction_prefix,
                grid_search=grid_search,
                grid_search_folds=grid_search_folds,
                cv_folds=cv_folds,
                grid_objective=grid_objective,
                output_metrics=output_metrics,
                param_grid=param_grid,
                grid_jobs=grid_search_jobs,
                save_cv_folds=save_cv_folds,
                save_cv_models=save_cv_models,
                use_custom_folds_for_grid_search=use_folds_file_for_grid_search
            )
            if models:
                for index, m in enumerate(models, start=1):
                    modelfile = join(model_path,
                                     '{}_fold{}.model'.format(job_name,
                                                              index))
                    m.save(modelfile)
        elif task == 'learning_curve':
            logger.info("Generating learning curve(s)")
            (curve_train_scores,
             curve_test_scores,
             computed_curve_train_sizes) = learner.learning_curve(
                train_examples,
                grid_objective,
                cv_folds=learning_curve_cv_folds,
                train_sizes=learning_curve_train_sizes)
        else:
            # if we have do not have a saved model, we need to train one.
            if not exists(modelfile) or overwrite:
                logger.info("Featurizing and training new {} model".format(
                    learner_name))

                (best_score,
                 grid_search_cv_results) = learner.train(
                    train_examples,
                    shuffle=shuffle,
                    grid_search=grid_search,
                    grid_search_folds=grid_search_folds,
                    grid_objective=grid_objective,
                    param_grid=param_grid,
                    grid_jobs=grid_search_jobs)
                grid_scores = [best_score]
                grid_search_cv_results_dicts = [grid_search_cv_results]

                # save model
                if model_path:
                    learner.save(modelfile)

                if grid_search:
                    logger.info("Best {} grid search score: {}".format(
                        grid_objective, round(best_score, 3)))
            else:
                grid_scores = [None]
                grid_search_cv_results_dicts = [None]

            # print out the parameters
            param_out = ('{}: {}'.format(param_name, param_value)
                         for param_name, param_value in
                         learner.model.get_params().items())
            logger.info("Hyperparameters: {}".format(', '.join(param_out)))

            # run on test set or cross-validate on training data,
            # depending on what was asked for
            if task == 'evaluate':
                logger.info("Evaluating predictions")
                task_results = [
                    learner.evaluate(test_examples,
                                     prediction_prefix=prediction_prefix,
                                     grid_objective=grid_objective,
                                     output_metrics=output_metrics)
                ]
            elif task == 'predict':
                logger.info("Writing predictions")
                # we set `class_labels` to `False` so that if the learner is
                # probabilistic, probabilities are written instead of labels
                learner.predict(test_examples,
                                prediction_prefix=prediction_prefix,
                                class_labels=False)
            # do nothing here for train

        end_timestamp = datetime.datetime.now()
        learner_result_dict_base['end_timestamp'] = end_timestamp.strftime(
            '%d %b %Y %H:%M:%S.%f')
        total_time = end_timestamp - start_timestamp
        learner_result_dict_base['total_time'] = str(total_time)

        if task == 'cross_validate' or task == 'evaluate':
            results_json_path = join(results_path,
                                     '{}.results.json'.format(job_name))

            res = _create_learner_result_dicts(task_results,
                                               grid_scores,
                                               grid_search_cv_results_dicts,
                                               learner_result_dict_base)

            # write out the result dictionary to a json file
            with open(results_json_path, 'w') as json_file:
                json.dump(res, json_file, cls=NumpyTypeEncoder)

            with open(join(results_path,
                           '{}.results'.format(job_name)),
                      'w') as output_file:
                _print_fancy_output(res, output_file)

        elif task == 'learning_curve':
            results_json_path = join(results_path,
                                     '{}.results.json'.format(job_name))

            res = {}
            res.update(learner_result_dict_base)
            res.update({
                'learning_curve_cv_folds': learning_curve_cv_folds,
                'given_curve_train_sizes': learning_curve_train_sizes,
                'learning_curve_train_scores_means':
                    np.mean(curve_train_scores, axis=1),
                'learning_curve_test_scores_means':
                    np.mean(curve_test_scores, axis=1),
                'learning_curve_train_scores_stds':
                    np.std(curve_train_scores, axis=1, ddof=1),
                'learning_curve_test_scores_stds':
                    np.std(curve_test_scores, axis=1, ddof=1),
                'computed_curve_train_sizes': computed_curve_train_sizes
            })

            # we need to return and write out a list of dictionaries
            res = [res]

            # write out the result dictionary to a json file
            with open(results_json_path, 'w') as json_file:
                json.dump(res, json_file, cls=NumpyTypeEncoder)

        # For all other tasks, i.e. train or predict
        else:
            if results_path:
                results_json_path = join(results_path,
                                         '{}.results.json'.format(job_name))

                assert len(grid_scores) == 1
                assert len(grid_search_cv_results_dicts) == 1
                grid_search_cv_results_dict = {"grid_score": grid_scores[0]}
                grid_search_cv_results_dict["grid_search_cv_results"] = \
                    grid_search_cv_results_dicts[0]
                grid_search_cv_results_dict.update(learner_result_dict_base)
                # write out the result dictionary to a json file
                with open(results_json_path, 'w') as json_file:
                    json.dump(grid_search_cv_results_dict,
                              json_file,
                              cls=NumpyTypeEncoder)
            res = [learner_result_dict_base]

        # write out the cv folds if required
        if task == 'cross_validate' and save_cv_folds:
            skll_fold_ids_file = experiment_name + '_skll_fold_ids.csv'
            with open(join(results_path, skll_fold_ids_file),
                      'w') as output_file:
                _write_skll_folds(skll_fold_ids, output_file)

    finally:
        close_and_remove_logger_handlers(logger)

    return res
def _classify_featureset(args):
    """
    Classification job to be submitted to grid.

    Runs one (featureset, learner) job: trains (or loads) a model,
    then cross-validates, evaluates, predicts, or just trains, depending
    on ``args['task']``, logging progress to ``args['log_path']``.

    Parameters
    ----------
    args : dict
        All job settings keyed by name (experiment/task names, data
        paths, learner configuration, grid-search settings, ...).  The
        dict is consumed via ``pop``; any leftover key is an error.

    Returns
    -------
    list of dict
        Result dictionaries (from ``_create_learner_result_dicts`` for
        cross_validate/evaluate; otherwise a single base-info dict).
    """
    # Extract all the arguments.
    # (There doesn't seem to be a better way to do this since one can't specify
    # required keyword arguments.)
    experiment_name = args.pop("experiment_name")
    task = args.pop("task")
    sampler = args.pop("sampler")
    feature_hasher = args.pop("feature_hasher")
    hasher_features = args.pop("hasher_features")
    job_name = args.pop("job_name")
    featureset = args.pop("featureset")
    featureset_name = args.pop("featureset_name")
    learner_name = args.pop("learner_name")
    train_path = args.pop("train_path")
    test_path = args.pop("test_path")
    train_set_name = args.pop("train_set_name")
    test_set_name = args.pop("test_set_name")
    shuffle = args.pop('shuffle')
    model_path = args.pop("model_path")
    prediction_prefix = args.pop("prediction_prefix")
    grid_search = args.pop("grid_search")
    grid_objective = args.pop("grid_objective")
    suffix = args.pop("suffix")
    log_path = args.pop("log_path")
    probability = args.pop("probability")
    results_path = args.pop("results_path")
    fixed_parameters = args.pop("fixed_parameters")
    sampler_parameters = args.pop("sampler_parameters")
    param_grid = args.pop("param_grid")
    pos_label_str = args.pop("pos_label_str")
    overwrite = args.pop("overwrite")
    feature_scaling = args.pop("feature_scaling")
    min_feature_count = args.pop("min_feature_count")
    grid_search_jobs = args.pop("grid_search_jobs")
    grid_search_folds = args.pop("grid_search_folds")
    cv_folds = args.pop("cv_folds")
    stratified_folds = args.pop("do_stratified_folds")
    label_col = args.pop("label_col")
    id_col = args.pop("id_col")
    ids_to_floats = args.pop("ids_to_floats")
    class_map = args.pop("class_map")
    custom_learner_path = args.pop("custom_learner_path")
    quiet = args.pop('quiet', False)

    # any key we did not pop above is a caller mistake — fail loudly
    if args:
        raise ValueError(("Extra arguments passed to _classify_featureset: "
                          "{}").format(args.keys()))
    start_timestamp = datetime.datetime.now()

    with open(log_path, 'w') as log_file:
        # logging
        print("Task:", task, file=log_file)
        if task == 'cross_validate':
            print(("Cross-validating ({} folds) on {}, feature " +
                   "set {} ...").format(cv_folds, train_set_name, featureset),
                  file=log_file)
        elif task == 'evaluate':
            print(("Training on {}, Test on {}, " +
                   "feature set {} ...").format(train_set_name, test_set_name,
                                                featureset),
                  file=log_file)
        elif task == 'train':
            print("Training on {}, feature set {} ...".format(train_set_name,
                                                              featureset),
                  file=log_file)
        else:  # predict
            print(("Training on {}, Making predictions about {}, " +
                   "feature set {} ...").format(train_set_name, test_set_name,
                                                featureset),
                  file=log_file)

        # check whether a trained model on the same data with the same
        # featureset already exists if so, load it and then use it on test data
        modelfile = join(model_path, '{}.model'.format(job_name))

        # cross-validation always retrains, so only reuse a saved model for
        # the other tasks (and only when not overwriting)
        if task == 'cross_validate' or (not exists(modelfile) or overwrite):
            train_examples = _load_featureset(train_path, featureset, suffix,
                                              label_col=label_col,
                                              id_col=id_col,
                                              ids_to_floats=ids_to_floats,
                                              quiet=quiet, class_map=class_map,
                                              feature_hasher=feature_hasher,
                                              num_features=hasher_features)

            train_set_size = len(train_examples.ids)
            if not train_examples.has_labels:
                raise ValueError('Training examples do not have labels')
            # initialize a classifer object
            learner = Learner(learner_name,
                              probability=probability,
                              feature_scaling=feature_scaling,
                              model_kwargs=fixed_parameters,
                              pos_label_str=pos_label_str,
                              min_feature_count=min_feature_count,
                              sampler=sampler,
                              sampler_kwargs=sampler_parameters,
                              custom_learner_path=custom_learner_path)
        # load the model if it already exists
        else:
            # import the custom learner path here in case we are reusing a
            # saved model
            if custom_learner_path:
                _import_custom_learner(custom_learner_path, learner_name)
            train_set_size = 'unknown'
            if exists(modelfile) and not overwrite:
                # NOTE(review): this message goes to stdout, not log_file —
                # confirm whether that is intentional
                print(('\tloading pre-existing %s model: %s') % (learner_name,
                                                                 modelfile))
            learner = Learner.from_file(modelfile)

        # Load test set if there is one
        if task == 'evaluate' or task == 'predict':
            test_examples = _load_featureset(test_path, featureset, suffix,
                                             label_col=label_col,
                                             id_col=id_col,
                                             ids_to_floats=ids_to_floats,
                                             quiet=quiet, class_map=class_map,
                                             feature_hasher=feature_hasher,
                                             num_features=hasher_features)
            test_set_size = len(test_examples.ids)
        else:
            test_set_size = 'n/a'

        # create a list of dictionaries of the results information
        learner_result_dict_base = {'experiment_name': experiment_name,
                                    'train_set_name': train_set_name,
                                    'train_set_size': train_set_size,
                                    'test_set_name': test_set_name,
                                    'test_set_size': test_set_size,
                                    'featureset': json.dumps(featureset),
                                    'featureset_name': featureset_name,
                                    'shuffle': shuffle,
                                    'learner_name': learner_name,
                                    'task': task,
                                    'start_timestamp':
                                    start_timestamp.strftime('%d %b %Y %H:%M:'
                                                             '%S.%f'),
                                    'version': __version__,
                                    'feature_scaling': feature_scaling,
                                    'grid_search': grid_search,
                                    'grid_objective': grid_objective,
                                    'grid_search_folds': grid_search_folds,
                                    'min_feature_count': min_feature_count,
                                    'cv_folds': cv_folds,
                                    'stratified_folds': stratified_folds,
                                    'scikit_learn_version': SCIKIT_VERSION}

        # check if we're doing cross-validation, because we only load/save
        # models when we're not.
        task_results = None
        if task == 'cross_validate':
            print('\tcross-validating', file=log_file)
            task_results, grid_scores = learner.cross_validate(
                train_examples,
                shuffle=shuffle,
                stratified=stratified_folds,
                prediction_prefix=prediction_prefix,
                grid_search=grid_search,
                grid_search_folds=grid_search_folds,
                cv_folds=cv_folds,
                grid_objective=grid_objective,
                param_grid=param_grid,
                grid_jobs=grid_search_jobs)
        else:
            # if we have do not have a saved model, we need to train one.
            if not exists(modelfile) or overwrite:
                print(('\tfeaturizing and training new ' +
                       '{} model').format(learner_name),
                      file=log_file)

                # custom (non-integer) CV folds double as grid-search folds
                if not isinstance(cv_folds, int):
                    grid_search_folds = cv_folds

                best_score = learner.train(train_examples,
                                           shuffle=shuffle,
                                           grid_search=grid_search,
                                           grid_search_folds=grid_search_folds,
                                           grid_objective=grid_objective,
                                           param_grid=param_grid,
                                           grid_jobs=grid_search_jobs)
                grid_scores = [best_score]

                # save model
                if model_path:
                    learner.save(modelfile)

                if grid_search:
                    # note: bankers' rounding is used in python 3,
                    # so these scores may be different between runs in
                    # python 2 and 3 at the final decimal place.
                    print('\tbest {} grid search score: {}'
                          .format(grid_objective, round(best_score, 3)),
                          file=log_file)
            else:
                # reused model: no grid-search score available
                grid_scores = [None]

            # print out the tuned parameters and best CV score
            param_out = ('{}: {}'.format(param_name, param_value)
                         for param_name, param_value in
                         iteritems(learner.model.get_params()))
            print('\thyperparameters: {}'.format(', '.join(param_out)),
                  file=log_file)

            # run on test set or cross-validate on training data,
            # depending on what was asked for
            if task == 'evaluate':
                print('\tevaluating predictions', file=log_file)
                task_results = [learner.evaluate(
                    test_examples,
                    prediction_prefix=prediction_prefix,
                    grid_objective=grid_objective)]
            elif task == 'predict':
                print('\twriting predictions', file=log_file)
                learner.predict(test_examples,
                                prediction_prefix=prediction_prefix)
            # do nothing here for train

        # record how long the whole job took
        end_timestamp = datetime.datetime.now()
        learner_result_dict_base['end_timestamp'] = end_timestamp.strftime(
            '%d %b %Y %H:%M:%S.%f')
        total_time = end_timestamp - start_timestamp
        learner_result_dict_base['total_time'] = str(total_time)

        if task == 'cross_validate' or task == 'evaluate':
            results_json_path = join(results_path,
                                     '{}.results.json'.format(job_name))

            res = _create_learner_result_dicts(task_results, grid_scores,
                                               learner_result_dict_base)

            # write out the result dictionary to a json file
            # (binary mode under python 2, text mode under python 3)
            file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
            with open(results_json_path, file_mode) as json_file:
                json.dump(res, json_file, cls=NumpyTypeEncoder)

            with open(join(results_path,
                           '{}.results'.format(job_name)),
                      'w') as output_file:
                _print_fancy_output(res, output_file)
        else:
            res = [learner_result_dict_base]

    return res
def _classify_featureset(args):
    '''
    Classification job to be submitted to grid.

    Runs one (featureset, learner) job: trains (or loads) a model, then
    cross-validates, evaluates, predicts, or just trains depending on
    ``args['task']``, logging progress to ``args['log_path']``.

    :param args: dict of all job settings keyed by name; consumed via
                 ``pop`` — any leftover key raises ``ValueError``.
    :returns: a list of result dictionaries (from
              ``_create_learner_result_dicts`` for cross_validate/evaluate,
              otherwise the single base-information dict).
    '''
    # Extract all the arguments.
    # (There doesn't seem to be a better way to do this since one can't specify
    # required keyword arguments.)
    experiment_name = args.pop("experiment_name")
    task = args.pop("task")
    job_name = args.pop("job_name")
    featureset = args.pop("featureset")
    learner_name = args.pop("learner_name")
    train_path = args.pop("train_path")
    test_path = args.pop("test_path")
    train_set_name = args.pop("train_set_name")
    test_set_name = args.pop("test_set_name")
    model_path = args.pop("model_path")
    prediction_prefix = args.pop("prediction_prefix")
    grid_search = args.pop("grid_search")
    grid_objective = args.pop("grid_objective")
    suffix = args.pop("suffix")
    log_path = args.pop("log_path")
    probability = args.pop("probability")
    results_path = args.pop("results_path")
    fixed_parameters = args.pop("fixed_parameters")
    param_grid = args.pop("param_grid")
    pos_label_str = args.pop("pos_label_str")
    overwrite = args.pop("overwrite")
    feature_scaling = args.pop("feature_scaling")
    min_feature_count = args.pop("min_feature_count")
    grid_search_jobs = args.pop("grid_search_jobs")
    cv_folds = args.pop("cv_folds")
    label_col = args.pop("label_col")
    ids_to_floats = args.pop("ids_to_floats")
    class_map = args.pop("class_map")
    quiet = args.pop('quiet', False)
    # any key we did not pop above is a caller mistake — fail loudly
    if args:
        raise ValueError(("Extra arguments passed to _classify_featureset: " +
                          "{}").format(args.keys()))

    timestamp = datetime.datetime.now().strftime('%d %b %Y %H:%M:%S')

    with open(log_path, 'w') as log_file:
        # logging
        print("Task:", task, file=log_file)
        if task == 'cross_validate':
            print(("Cross-validating on {}, feature " +
                   "set {} ...").format(train_set_name, featureset),
                  file=log_file)
        elif task == 'evaluate':
            print(("Training on {}, Test on {}, " +
                   "feature set {} ...").format(train_set_name, test_set_name,
                                                featureset),
                  file=log_file)
        elif task == 'train':
            print("Training on {}, feature set {} ...".format(train_set_name,
                                                              featureset),
                  file=log_file)
        else:  # predict
            print(("Training on {}, Making predictions about {}, " +
                   "feature set {} ...").format(train_set_name, test_set_name,
                                                featureset),
                  file=log_file)

        # check whether a trained model on the same data with the same
        # featureset already exists if so, load it and then use it on test data
        modelfile = os.path.join(model_path, '{}.model'.format(job_name))

        # load the training and test examples
        # (cross-validation always retrains, so only reuse a saved model
        # for the other tasks, and only when not overwriting)
        if task == 'cross_validate' or (not os.path.exists(modelfile) or
                                        overwrite):
            train_examples = _load_featureset(train_path, featureset, suffix,
                                              label_col=label_col,
                                              ids_to_floats=ids_to_floats,
                                              quiet=quiet, class_map=class_map)
            # initialize a classifer object
            learner = Learner(learner_name,
                              probability=probability,
                              feature_scaling=feature_scaling,
                              model_kwargs=fixed_parameters,
                              pos_label_str=pos_label_str,
                              min_feature_count=min_feature_count)
        # load the model if it already exists
        else:
            if os.path.exists(modelfile) and not overwrite:
                # NOTE(review): this message goes to stdout, not log_file —
                # confirm whether that is intentional
                print(('\tloading pre-existing {} ' +
                       'model: {}').format(learner_name, modelfile))
            learner = Learner.from_file(modelfile)

        # Load test set if there is one
        if task == 'evaluate' or task == 'predict':
            test_examples = _load_featureset(test_path, featureset, suffix,
                                             label_col=label_col,
                                             ids_to_floats=ids_to_floats,
                                             quiet=quiet, class_map=class_map,
                                             unlabelled=True)

        # create a list of dictionaries of the results information
        learner_result_dict_base = {'experiment_name': experiment_name,
                                    'train_set_name': train_set_name,
                                    'test_set_name': test_set_name,
                                    'featureset': json.dumps(featureset),
                                    'learner_name': learner_name,
                                    'task': task,
                                    'timestamp': timestamp,
                                    'version': __version__,
                                    'feature_scaling': feature_scaling,
                                    'grid_search': grid_search,
                                    'grid_objective': grid_objective,
                                    'min_feature_count': min_feature_count}

        # check if we're doing cross-validation, because we only load/save
        # models when we're not.
        task_results = None
        if task == 'cross_validate':
            print('\tcross-validating', file=log_file)
            task_results, grid_scores = learner.cross_validate(
                train_examples,
                prediction_prefix=prediction_prefix,
                grid_search=grid_search,
                cv_folds=cv_folds,
                grid_objective=grid_objective,
                param_grid=param_grid,
                grid_jobs=grid_search_jobs)
        else:
            # if we have do not have a saved model, we need to train one.
            if not os.path.exists(modelfile) or overwrite:
                print(('\tfeaturizing and training new ' +
                       '{} model').format(learner_name),
                      file=log_file)

                # default to 5 grid-search folds; custom (non-integer) CV
                # folds double as the grid-search folds
                grid_search_folds = 5
                if not isinstance(cv_folds, int):
                    grid_search_folds = cv_folds

                best_score = learner.train(train_examples,
                                           grid_search=grid_search,
                                           grid_search_folds=grid_search_folds,
                                           grid_objective=grid_objective,
                                           param_grid=param_grid,
                                           grid_jobs=grid_search_jobs)
                grid_scores = [best_score]

                # save model
                if model_path:
                    learner.save(modelfile)

                if grid_search:
                    print('\tbest {} grid search score: {}'
                          .format(grid_objective, round(best_score, 3)),
                          file=log_file)
            else:
                # reused model: no grid-search score available
                grid_scores = [None]

            # print out the tuned parameters and best CV score
            param_out = ('{}: {}'.format(param_name, param_value)
                         for param_name, param_value in
                         iteritems(learner.model.get_params()))
            print('\thyperparameters: {}'.format(', '.join(param_out)),
                  file=log_file)

            # run on test set or cross-validate on training data,
            # depending on what was asked for
            if task == 'evaluate':
                print('\tevaluating predictions', file=log_file)
                task_results = [learner.evaluate(
                    test_examples,
                    prediction_prefix=prediction_prefix,
                    grid_objective=grid_objective)]
            elif task == 'predict':
                print('\twriting predictions', file=log_file)
                learner.predict(test_examples,
                                prediction_prefix=prediction_prefix)
            # do nothing here for train

        if task == 'cross_validate' or task == 'evaluate':
            results_json_path = os.path.join(
                results_path, '{}.results.json'.format(job_name))

            res = _create_learner_result_dicts(task_results, grid_scores,
                                               learner_result_dict_base)

            # write out the result dictionary to a json file
            # (binary mode under python 2, text mode under python 3)
            file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
            with open(results_json_path, file_mode) as json_file:
                json.dump(res, json_file)

            with open(os.path.join(results_path,
                                   '{}.results'.format(job_name)),
                      'w') as output_file:
                _print_fancy_output(res, output_file)
        else:
            res = [learner_result_dict_base]

    return res
def _classify_featureset(args):
    """
    Classification job to be submitted to grid.

    Runs one (featureset, learner) job: trains (or loads) a model,
    then cross-validates, evaluates, predicts, or just trains, depending
    on ``args['task']``, logging progress to ``args['log_path']``.  When
    ``save_cv_folds`` is set, also writes the cross-validation fold
    assignments out to a CSV file.

    Parameters
    ----------
    args : dict
        All job settings keyed by name (experiment/task names, data
        paths, learner configuration, grid-search settings, ...).  The
        dict is consumed via ``pop``; any leftover key is an error.

    Returns
    -------
    list of dict
        Result dictionaries (from ``_create_learner_result_dicts`` for
        cross_validate/evaluate; otherwise a single base-info dict).
    """
    # Extract all the arguments.
    # (There doesn't seem to be a better way to do this since one can't specify
    # required keyword arguments.)
    experiment_name = args.pop("experiment_name")
    task = args.pop("task")
    sampler = args.pop("sampler")
    feature_hasher = args.pop("feature_hasher")
    hasher_features = args.pop("hasher_features")
    job_name = args.pop("job_name")
    featureset = args.pop("featureset")
    featureset_name = args.pop("featureset_name")
    learner_name = args.pop("learner_name")
    train_path = args.pop("train_path")
    test_path = args.pop("test_path")
    train_set_name = args.pop("train_set_name")
    test_set_name = args.pop("test_set_name")
    shuffle = args.pop('shuffle')
    model_path = args.pop("model_path")
    prediction_prefix = args.pop("prediction_prefix")
    grid_search = args.pop("grid_search")
    grid_objective = args.pop("grid_objective")
    suffix = args.pop("suffix")
    log_path = args.pop("log_path")
    probability = args.pop("probability")
    results_path = args.pop("results_path")
    fixed_parameters = args.pop("fixed_parameters")
    sampler_parameters = args.pop("sampler_parameters")
    param_grid = args.pop("param_grid")
    pos_label_str = args.pop("pos_label_str")
    overwrite = args.pop("overwrite")
    feature_scaling = args.pop("feature_scaling")
    min_feature_count = args.pop("min_feature_count")
    grid_search_jobs = args.pop("grid_search_jobs")
    grid_search_folds = args.pop("grid_search_folds")
    cv_folds = args.pop("cv_folds")
    save_cv_folds = args.pop("save_cv_folds")
    stratified_folds = args.pop("do_stratified_folds")
    label_col = args.pop("label_col")
    id_col = args.pop("id_col")
    ids_to_floats = args.pop("ids_to_floats")
    class_map = args.pop("class_map")
    custom_learner_path = args.pop("custom_learner_path")
    quiet = args.pop('quiet', False)

    # any key we did not pop above is a caller mistake — fail loudly
    if args:
        raise ValueError(("Extra arguments passed to _classify_featureset: "
                          "{}").format(args.keys()))
    start_timestamp = datetime.datetime.now()

    with open(log_path, 'w') as log_file:
        # logging
        print("Task:", task, file=log_file)
        if task == 'cross_validate':
            print(("Cross-validating ({} folds) on {}, feature " +
                   "set {} ...").format(cv_folds, train_set_name, featureset),
                  file=log_file)
        elif task == 'evaluate':
            print(("Training on {}, Test on {}, " +
                   "feature set {} ...").format(train_set_name, test_set_name,
                                                featureset),
                  file=log_file)
        elif task == 'train':
            print("Training on {}, feature set {} ...".format(train_set_name,
                                                              featureset),
                  file=log_file)
        else:  # predict
            print(("Training on {}, Making predictions about {}, " +
                   "feature set {} ...").format(train_set_name, test_set_name,
                                                featureset),
                  file=log_file)

        # check whether a trained model on the same data with the same
        # featureset already exists if so, load it and then use it on test data
        modelfile = join(model_path, '{}.model'.format(job_name))

        # cross-validation always retrains, so only reuse a saved model for
        # the other tasks (and only when not overwriting)
        if task == 'cross_validate' or (not exists(modelfile) or overwrite):
            train_examples = _load_featureset(train_path, featureset, suffix,
                                              label_col=label_col,
                                              id_col=id_col,
                                              ids_to_floats=ids_to_floats,
                                              quiet=quiet, class_map=class_map,
                                              feature_hasher=feature_hasher,
                                              num_features=hasher_features)

            train_set_size = len(train_examples.ids)
            if not train_examples.has_labels:
                raise ValueError('Training examples do not have labels')
            # initialize a classifer object
            learner = Learner(learner_name,
                              probability=probability,
                              feature_scaling=feature_scaling,
                              model_kwargs=fixed_parameters,
                              pos_label_str=pos_label_str,
                              min_feature_count=min_feature_count,
                              sampler=sampler,
                              sampler_kwargs=sampler_parameters,
                              custom_learner_path=custom_learner_path)
        # load the model if it already exists
        else:
            # import the custom learner path here in case we are reusing a
            # saved model
            if custom_learner_path:
                _import_custom_learner(custom_learner_path, learner_name)
            train_set_size = 'unknown'
            if exists(modelfile) and not overwrite:
                # NOTE(review): this message goes to stdout, not log_file —
                # confirm whether that is intentional
                print(('\tloading pre-existing %s model: %s') % (learner_name,
                                                                 modelfile))
            learner = Learner.from_file(modelfile)

        # Load test set if there is one
        if task == 'evaluate' or task == 'predict':
            test_examples = _load_featureset(test_path, featureset, suffix,
                                             label_col=label_col,
                                             id_col=id_col,
                                             ids_to_floats=ids_to_floats,
                                             quiet=quiet, class_map=class_map,
                                             feature_hasher=feature_hasher,
                                             num_features=hasher_features)
            test_set_size = len(test_examples.ids)
        else:
            test_set_size = 'n/a'

        # create a list of dictionaries of the results information
        learner_result_dict_base = {'experiment_name': experiment_name,
                                    'train_set_name': train_set_name,
                                    'train_set_size': train_set_size,
                                    'test_set_name': test_set_name,
                                    'test_set_size': test_set_size,
                                    'featureset': json.dumps(featureset),
                                    'featureset_name': featureset_name,
                                    'shuffle': shuffle,
                                    'learner_name': learner_name,
                                    'task': task,
                                    'start_timestamp':
                                    start_timestamp.strftime('%d %b %Y %H:%M:'
                                                             '%S.%f'),
                                    'version': __version__,
                                    'feature_scaling': feature_scaling,
                                    'grid_search': grid_search,
                                    'grid_objective': grid_objective,
                                    'grid_search_folds': grid_search_folds,
                                    'min_feature_count': min_feature_count,
                                    'cv_folds': cv_folds,
                                    'save_cv_folds': save_cv_folds,
                                    'stratified_folds': stratified_folds,
                                    'scikit_learn_version': SCIKIT_VERSION}

        # check if we're doing cross-validation, because we only load/save
        # models when we're not.
        task_results = None
        if task == 'cross_validate':
            print('\tcross-validating', file=log_file)
            task_results, grid_scores, skll_fold_ids = learner.cross_validate(
                train_examples,
                shuffle=shuffle,
                stratified=stratified_folds,
                prediction_prefix=prediction_prefix,
                grid_search=grid_search,
                grid_search_folds=grid_search_folds,
                cv_folds=cv_folds,
                grid_objective=grid_objective,
                param_grid=param_grid,
                grid_jobs=grid_search_jobs,
                save_cv_folds=save_cv_folds)
        else:
            # if we have do not have a saved model, we need to train one.
            if not exists(modelfile) or overwrite:
                print(('\tfeaturizing and training new ' +
                       '{} model').format(learner_name),
                      file=log_file)

                # custom (non-integer) CV folds double as grid-search folds
                if not isinstance(cv_folds, int):
                    grid_search_folds = cv_folds

                best_score = learner.train(train_examples,
                                           shuffle=shuffle,
                                           grid_search=grid_search,
                                           grid_search_folds=grid_search_folds,
                                           grid_objective=grid_objective,
                                           param_grid=param_grid,
                                           grid_jobs=grid_search_jobs)
                grid_scores = [best_score]

                # save model
                if model_path:
                    learner.save(modelfile)

                if grid_search:
                    # note: bankers' rounding is used in python 3,
                    # so these scores may be different between runs in
                    # python 2 and 3 at the final decimal place.
                    print('\tbest {} grid search score: {}'
                          .format(grid_objective, round(best_score, 3)),
                          file=log_file)
            else:
                # reused model: no grid-search score available
                grid_scores = [None]

            # print out the tuned parameters and best CV score
            param_out = ('{}: {}'.format(param_name, param_value)
                         for param_name, param_value in
                         iteritems(learner.model.get_params()))
            print('\thyperparameters: {}'.format(', '.join(param_out)),
                  file=log_file)

            # run on test set or cross-validate on training data,
            # depending on what was asked for
            if task == 'evaluate':
                print('\tevaluating predictions', file=log_file)
                task_results = [learner.evaluate(
                    test_examples,
                    prediction_prefix=prediction_prefix,
                    grid_objective=grid_objective)]
            elif task == 'predict':
                print('\twriting predictions', file=log_file)
                learner.predict(test_examples,
                                prediction_prefix=prediction_prefix)
            # do nothing here for train

        # record how long the whole job took
        end_timestamp = datetime.datetime.now()
        learner_result_dict_base['end_timestamp'] = end_timestamp.strftime(
            '%d %b %Y %H:%M:%S.%f')
        total_time = end_timestamp - start_timestamp
        learner_result_dict_base['total_time'] = str(total_time)

        if task == 'cross_validate' or task == 'evaluate':
            results_json_path = join(results_path,
                                     '{}.results.json'.format(job_name))

            res = _create_learner_result_dicts(task_results, grid_scores,
                                               learner_result_dict_base)

            # write out the result dictionary to a json file
            # (binary mode under python 2, text mode under python 3)
            file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
            with open(results_json_path, file_mode) as json_file:
                json.dump(res, json_file, cls=NumpyTypeEncoder)

            with open(join(results_path,
                           '{}.results'.format(job_name)),
                      'w') as output_file:
                _print_fancy_output(res, output_file)
        else:
            res = [learner_result_dict_base]

        # write out the cv folds if required
        if task == 'cross_validate' and save_cv_folds:
            skll_fold_ids_file = experiment_name + '_skll_fold_ids.csv'
            file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
            with open(join(results_path, skll_fold_ids_file),
                      file_mode) as output_file:
                _write_skll_folds(skll_fold_ids, output_file)

    return res
def _classify_featureset(args):
    '''
    Classification job to be submitted to grid.

    Runs one (featureset, learner) job: trains (or loads) a model, then
    cross-validates, evaluates, predicts, or just trains depending on
    ``args['task']``, logging progress to ``args['log_path']``.

    :param args: dict of all job settings keyed by name; consumed via
                 ``pop`` — any leftover key raises ``ValueError``.
    :returns: a list of result dictionaries (from
              ``_create_learner_result_dicts`` for cross_validate/evaluate,
              otherwise the single base-information dict).
    '''
    # Extract all the arguments.
    # (There doesn't seem to be a better way to do this since one can't specify
    # required keyword arguments.)
    experiment_name = args.pop("experiment_name")
    task = args.pop("task")
    job_name = args.pop("job_name")
    featureset = args.pop("featureset")
    learner_name = args.pop("learner_name")
    train_path = args.pop("train_path")
    test_path = args.pop("test_path")
    train_set_name = args.pop("train_set_name")
    test_set_name = args.pop("test_set_name")
    model_path = args.pop("model_path")
    prediction_prefix = args.pop("prediction_prefix")
    grid_search = args.pop("grid_search")
    grid_objective = args.pop("grid_objective")
    suffix = args.pop("suffix")
    log_path = args.pop("log_path")
    probability = args.pop("probability")
    results_path = args.pop("results_path")
    fixed_parameters = args.pop("fixed_parameters")
    param_grid = args.pop("param_grid")
    pos_label_str = args.pop("pos_label_str")
    overwrite = args.pop("overwrite")
    feature_scaling = args.pop("feature_scaling")
    min_feature_count = args.pop("min_feature_count")
    grid_search_jobs = args.pop("grid_search_jobs")
    cv_folds = args.pop("cv_folds")
    label_col = args.pop("label_col")
    ids_to_floats = args.pop("ids_to_floats")
    class_map = args.pop("class_map")
    quiet = args.pop('quiet', False)
    # any key we did not pop above is a caller mistake — fail loudly
    if args:
        raise ValueError(("Extra arguments passed to _classify_featureset: " +
                          "{}").format(args.keys()))

    timestamp = datetime.datetime.now().strftime('%d %b %Y %H:%M:%S')

    with open(log_path, 'w') as log_file:
        # logging
        print("Task:", task, file=log_file)
        if task == 'cross_validate':
            print(("Cross-validating on {}, feature " +
                   "set {} ...").format(train_set_name, featureset),
                  file=log_file)
        elif task == 'evaluate':
            print(("Training on {}, Test on {}, " +
                   "feature set {} ...").format(train_set_name, test_set_name,
                                                featureset),
                  file=log_file)
        elif task == 'train':
            print("Training on {}, feature set {} ...".format(train_set_name,
                                                              featureset),
                  file=log_file)
        else:  # predict
            print(("Training on {}, Making predictions about {}, " +
                   "feature set {} ...").format(train_set_name, test_set_name,
                                                featureset),
                  file=log_file)

        # check whether a trained model on the same data with the same
        # featureset already exists if so, load it and then use it on test data
        modelfile = os.path.join(model_path, '{}.model'.format(job_name))

        # load the training and test examples
        # (cross-validation always retrains, so only reuse a saved model
        # for the other tasks, and only when not overwriting)
        if task == 'cross_validate' or (not os.path.exists(modelfile) or
                                        overwrite):
            train_examples = _load_featureset(train_path, featureset, suffix,
                                              label_col=label_col,
                                              ids_to_floats=ids_to_floats,
                                              quiet=quiet, class_map=class_map)
            # initialize a classifer object
            learner = Learner(learner_name,
                              probability=probability,
                              feature_scaling=feature_scaling,
                              model_kwargs=fixed_parameters,
                              pos_label_str=pos_label_str,
                              min_feature_count=min_feature_count)
        # load the model if it already exists
        else:
            if os.path.exists(modelfile) and not overwrite:
                # NOTE(review): this message goes to stdout, not log_file —
                # confirm whether that is intentional
                print(('\tloading pre-existing {} ' +
                       'model: {}').format(learner_name, modelfile))
            learner = Learner.from_file(modelfile)

        # Load test set if there is one
        if task == 'evaluate' or task == 'predict':
            test_examples = _load_featureset(test_path, featureset, suffix,
                                             label_col=label_col,
                                             ids_to_floats=ids_to_floats,
                                             quiet=quiet, class_map=class_map,
                                             unlabelled=True)

        # create a list of dictionaries of the results information
        learner_result_dict_base = {'experiment_name': experiment_name,
                                    'train_set_name': train_set_name,
                                    'test_set_name': test_set_name,
                                    'featureset': json.dumps(featureset),
                                    'learner_name': learner_name,
                                    'task': task,
                                    'timestamp': timestamp,
                                    'version': __version__,
                                    'feature_scaling': feature_scaling,
                                    'grid_search': grid_search,
                                    'grid_objective': grid_objective,
                                    'min_feature_count': min_feature_count}

        # check if we're doing cross-validation, because we only load/save
        # models when we're not.
        task_results = None
        if task == 'cross_validate':
            print('\tcross-validating', file=log_file)
            task_results, grid_scores = learner.cross_validate(
                train_examples,
                prediction_prefix=prediction_prefix,
                grid_search=grid_search,
                cv_folds=cv_folds,
                grid_objective=grid_objective,
                param_grid=param_grid,
                grid_jobs=grid_search_jobs)
        else:
            # if we have do not have a saved model, we need to train one.
            if not os.path.exists(modelfile) or overwrite:
                print(('\tfeaturizing and training new ' +
                       '{} model').format(learner_name),
                      file=log_file)

                # default to 5 grid-search folds; custom (non-integer) CV
                # folds double as the grid-search folds
                grid_search_folds = 5
                if not isinstance(cv_folds, int):
                    grid_search_folds = cv_folds

                best_score = learner.train(train_examples,
                                           grid_search=grid_search,
                                           grid_search_folds=grid_search_folds,
                                           grid_objective=grid_objective,
                                           param_grid=param_grid,
                                           grid_jobs=grid_search_jobs)
                grid_scores = [best_score]

                # save model
                if model_path:
                    learner.save(modelfile)

                if grid_search:
                    # note: bankers' rounding is used in python 3,
                    # so these scores may be different between runs in
                    # python 2 and 3 at the final decimal place.
                    print('\tbest {} grid search score: {}'.format(
                        grid_objective, round(best_score, 3)),
                        file=log_file)
            else:
                # reused model: no grid-search score available
                grid_scores = [None]

            # print out the tuned parameters and best CV score
            param_out = ('{}: {}'.format(param_name, param_value)
                         for param_name, param_value in
                         iteritems(learner.model.get_params()))
            print('\thyperparameters: {}'.format(', '.join(param_out)),
                  file=log_file)

            # run on test set or cross-validate on training data,
            # depending on what was asked for
            if task == 'evaluate':
                print('\tevaluating predictions', file=log_file)
                task_results = [
                    learner.evaluate(test_examples,
                                     prediction_prefix=prediction_prefix,
                                     grid_objective=grid_objective)
                ]
            elif task == 'predict':
                print('\twriting predictions', file=log_file)
                learner.predict(test_examples,
                                prediction_prefix=prediction_prefix)
            # do nothing here for train

        if task == 'cross_validate' or task == 'evaluate':
            results_json_path = os.path.join(
                results_path, '{}.results.json'.format(job_name))

            res = _create_learner_result_dicts(task_results, grid_scores,
                                               learner_result_dict_base)

            # write out the result dictionary to a json file
            # (binary mode under python 2, text mode under python 3)
            file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
            with open(results_json_path, file_mode) as json_file:
                json.dump(res, json_file)

            with open(
                    os.path.join(results_path,
                                 '{}.results'.format(job_name)),
                    'w') as output_file:
                _print_fancy_output(res, output_file)
        else:
            res = [learner_result_dict_base]

    return res