def test_predict_on_subset_with_existing_model():
    """
    Test generating predictions on subset with existing model
    """
    # Create data files
    make_single_file_featureset_data()

    # train and save a model on the training file
    train_fs = NDJReader.for_path(join(_my_dir, 'train',
                                       'train_single_file.jsonlines')).read()
    learner = Learner('RandomForestClassifier')
    learner.train(train_fs, grid_search=True, grid_objective="accuracy")
    model_filename = join(_my_dir, 'output',
                          ('train_test_single_file_train_train_'
                           'single_file.jsonlines_test_test_single'
                           '_file_subset.jsonlines_RandomForestClassifier'
                           '.model'))
    learner.save(model_filename)

    # Run experiment
    config_path = fill_in_config_paths_for_single_file(
        join(_my_dir, "configs",
             "test_single_file_saved_subset.template.cfg"),
        join(_my_dir, 'train', 'train_single_file.jsonlines'),
        join(_my_dir, 'test', 'test_single_file_subset.jsonlines'))
    run_configuration(config_path, quiet=True, overwrite=False)

    # Check results
    with open(join(_my_dir, 'output',
                   ('train_test_single_file_train_train_'
                    'single_file.jsonlines_test_test_single'
                    '_file_subset.jsonlines_RandomForestClassifier'
                    '.results.json'))) as f:
        result_dict = json.load(f)[0]
    assert_almost_equal(result_dict['accuracy'], 0.7333333)
def check_print_model_weights(task='classification'):

    # create some simple classification or regression data
    if task == 'classification':
        train_fs, _ = make_classification_data(train_test_ratio=0.8)
    else:
        train_fs, _, _ = make_regression_data(num_features=4,
                                              train_test_ratio=0.8)

    # now train the appropriate model
    if task == 'classification':
        learner = Learner('LogisticRegression')
        learner.train(train_fs)
    else:
        learner = Learner('LinearRegression')
        learner.train(train_fs, grid_objective='pearson')

    # now save the model to disk
    model_file = join(_my_dir, 'output', 'test_print_model_weights.model')
    learner.save(model_file)

    # now call print_model_weights main() and capture the output
    print_model_weights_cmd = [model_file]
    err = ''
    try:
        old_stderr = sys.stderr
        old_stdout = sys.stdout
        sys.stderr = mystderr = StringIO()
        sys.stdout = mystdout = StringIO()
        pmw.main(print_model_weights_cmd)
        out = mystdout.getvalue()
        err = mystderr.getvalue()
    finally:
        sys.stderr = old_stderr
        sys.stdout = old_stdout
        print(err)

    # now parse the output of the print_model_weight command
    # and get the intercept and the feature values
    if task == 'classification':
        lines_to_parse = [l for l in out.split('\n')[1:] if l]
        intercept = safe_float(lines_to_parse[0].split('\t')[0])
        feature_values = []
        for ltp in lines_to_parse[1:]:
            fields = ltp.split('\t')
            feature_values.append((fields[2], safe_float(fields[0])))
        feature_values = [t[1] for t in sorted(feature_values)]
        assert_almost_equal(intercept, learner.model.intercept_[0])
        assert_allclose(learner.model.coef_[0], feature_values)
    else:
        lines_to_parse = [l for l in out.split('\n') if l]
        intercept = safe_float(lines_to_parse[0].split('=')[1])
        feature_values = []
        for ltp in lines_to_parse[1:]:
            fields = ltp.split('\t')
            feature_values.append((fields[1], safe_float(fields[0])))
        feature_values = [t[1] for t in sorted(feature_values)]
        assert_almost_equal(intercept, learner.model.intercept_)
        assert_allclose(learner.model.coef_, feature_values)
def test_predict_on_subset_with_existing_model():
    """
    Test generating predictions on subset with existing model
    """
    # Create data files
    make_single_file_featureset_data()

    # train and save a model on the training file
    train_fs = NDJReader.for_path(join(_my_dir, 'train',
                                       'train_single_file.jsonlines')).read()
    learner = Learner('RandomForestClassifier')
    learner.train(train_fs, grid_search=True, grid_objective="accuracy")
    model_filename = join(_my_dir, 'output',
                          ('train_test_single_file_train_train_'
                           'single_file.jsonlines_test_test_single'
                           '_file_subset.jsonlines_RandomForestClassifier'
                           '.model'))
    learner.save(model_filename)

    # Run experiment
    config_path = fill_in_config_paths_for_single_file(
        join(_my_dir, "configs",
             "test_single_file_saved_subset.template.cfg"),
        join(_my_dir, 'train', 'train_single_file.jsonlines'),
        join(_my_dir, 'test', 'test_single_file_subset.jsonlines'))
    run_configuration(config_path, quiet=True, overwrite=False)

    # Check results
    with open(join(_my_dir, 'output',
                   ('train_test_single_file_train_train_'
                    'single_file.jsonlines_test_test_single'
                    '_file_subset.jsonlines_RandomForestClassifier'
                    '.results.json'))) as f:
        result_dict = json.load(f)[0]
    assert_almost_equal(result_dict['score'], 0.7333333)
def check_generate_predictions_console(use_threshold=False):

    # create some simple classification data without feature hashing
    train_fs, test_fs = make_classification_data(num_examples=1000,
                                                 num_features=5)

    # save the test feature set to an NDJ file
    input_file = join(_my_dir, 'test', 'test_generate_predictions.jsonlines')
    writer = NDJWriter(input_file, test_fs)
    writer.write()

    # create a learner that uses an SGD classifier
    learner = Learner('SGDClassifier', probability=use_threshold)

    # train the learner with grid search
    learner.train(train_fs, grid_search=True)

    # get the predictions on the test featureset
    predictions = learner.predict(test_fs)

    # if we asked for probabilities, then use the threshold
    # to convert them into binary predictions
    if use_threshold:
        threshold = 0.6
        predictions = [int(p[1] >= threshold) for p in predictions]
    else:
        predictions = predictions.tolist()
        threshold = None

    # save the learner to a file
    model_file = join(_my_dir, 'output',
                      'test_generate_predictions_console.model')
    learner.save(model_file)

    # now call main() from generate_predictions.py
    generate_cmd = []
    if use_threshold:
        generate_cmd.append('-t {}'.format(threshold))
    generate_cmd.extend([model_file, input_file])

    # we need to capture stdout since that's what main() writes to
    err = ''
    try:
        old_stdout = sys.stdout
        old_stderr = sys.stderr
        sys.stdout = mystdout = StringIO()
        sys.stderr = mystderr = StringIO()
        gp.main(generate_cmd)
        out = mystdout.getvalue()
        err = mystderr.getvalue()
        predictions_after_saving = [int(x) for x in out.strip().split('\n')]
        eq_(predictions, predictions_after_saving)
    finally:
        sys.stdout = old_stdout
        sys.stderr = old_stderr
        print(err)
def check_generate_predictions(use_feature_hashing=False,
                               use_threshold=False,
                               test_on_subset=False):

    # create some simple classification feature sets for training and testing
    train_fs, test_fs = make_classification_data(
        num_examples=1000,
        num_features=5,
        use_feature_hashing=use_feature_hashing,
        feature_bins=4)

    # create a learner that uses an SGD classifier
    learner = Learner('SGDClassifier', probability=use_threshold)

    # train the learner with grid search
    learner.train(train_fs, grid_search=True)

    # if we are asked to use only a subset, then filter out
    # one of the features if we are not using feature hashing,
    # do nothing if we are using feature hashing
    if test_on_subset and not use_feature_hashing:
        test_fs.filter(features=['f01', 'f02', 'f03', 'f04'])

    # get the predictions on the test featureset
    predictions = learner.predict(test_fs)

    # if we asked for probabilities, then use the threshold
    # to convert them into binary predictions
    if use_threshold:
        threshold = 0.6
        predictions = [int(p[1] >= threshold) for p in predictions]
    else:
        predictions = predictions.tolist()
        threshold = None

    # save the learner to a file
    model_file = join(_my_dir, 'output', 'test_generate_predictions.model')
    learner.save(model_file)

    # now use Predictor to generate the predictions and make
    # sure that they are the same as before saving the model
    p = gp.Predictor(model_file, threshold=threshold)
    predictions_after_saving = p.predict(test_fs)

    eq_(predictions, predictions_after_saving)
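# A minimal sketch of how a parameterized check function like
# check_generate_predictions() above is typically driven by a nose-style
# yield-based test generator. This generator is an assumption for
# illustration, not necessarily the exact test that ships with SKLL.
def test_generate_predictions():
    # exercise the combinations of feature hashing, thresholding,
    # and predicting on a feature subset
    for use_feature_hashing in [True, False]:
        for use_threshold in [True, False]:
            for test_on_subset in [True, False]:
                yield (check_generate_predictions, use_feature_hashing,
                       use_threshold, test_on_subset)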
def check_generate_predictions(use_feature_hashing=False, use_threshold=False):

    # create some simple classification data without feature hashing
    train_fs, test_fs = make_classification_data(
        num_examples=1000,
        num_features=5,
        use_feature_hashing=use_feature_hashing,
        feature_bins=4)

    # create a learner that uses an SGD classifier
    learner = Learner('SGDClassifier', probability=use_threshold)

    # train the learner with grid search
    learner.train(train_fs, grid_search=True)

    # get the predictions on the test featureset
    predictions = learner.predict(test_fs)

    # if we asked for probabilities, then use the threshold
    # to convert them into binary predictions
    if use_threshold:
        threshold = 0.6
        predictions = [int(p[1] >= threshold) for p in predictions]
    else:
        predictions = predictions.tolist()
        threshold = None

    # save the learner to a file
    model_file = join(_my_dir, 'output', 'test_generate_predictions.model')
    learner.save(model_file)

    # now use Predictor to generate the predictions and make
    # sure that they are the same as before saving the model
    p = gp.Predictor(model_file, threshold=threshold)
    predictions_after_saving = p.predict(test_fs)

    eq_(predictions, predictions_after_saving)
def _classify_featureset(args):
    """
    Classification job to be submitted to grid.

    Parameters
    ----------
    args : dict
        A dictionary with arguments for classifying the
        ``FeatureSet`` instance.

    Returns
    -------
    res : list of dicts
        The results of the classification, in the format
        of a list of dictionaries.

    Raises
    ------
    ValueError
        If extra unknown arguments are passed to the function.
    """

    # Extract all the arguments.
    # (There doesn't seem to be a better way to do this since one can't
    # specify required keyword arguments.)
    experiment_name = args.pop("experiment_name")
    task = args.pop("task")
    sampler = args.pop("sampler")
    feature_hasher = args.pop("feature_hasher")
    hasher_features = args.pop("hasher_features")
    job_name = args.pop("job_name")
    featureset = args.pop("featureset")
    featureset_name = args.pop("featureset_name")
    learner_name = args.pop("learner_name")
    train_path = args.pop("train_path")
    test_path = args.pop("test_path")
    train_set_name = args.pop("train_set_name")
    test_set_name = args.pop("test_set_name")
    shuffle = args.pop('shuffle')
    model_path = args.pop("model_path")
    prediction_prefix = args.pop("prediction_prefix")
    grid_search = args.pop("grid_search")
    grid_objective = args.pop("grid_objective")
    output_metrics = args.pop("output_metrics")
    suffix = args.pop("suffix")
    job_log_file = args.pop("log_file")
    job_log_level = args.pop("log_level")
    probability = args.pop("probability")
    pipeline = args.pop("pipeline")
    results_path = args.pop("results_path")
    fixed_parameters = args.pop("fixed_parameters")
    sampler_parameters = args.pop("sampler_parameters")
    param_grid = args.pop("param_grid")
    pos_label_str = args.pop("pos_label_str")
    overwrite = args.pop("overwrite")
    feature_scaling = args.pop("feature_scaling")
    min_feature_count = args.pop("min_feature_count")
    folds_file = args.pop("folds_file")
    grid_search_jobs = args.pop("grid_search_jobs")
    grid_search_folds = args.pop("grid_search_folds")
    cv_folds = args.pop("cv_folds")
    save_cv_folds = args.pop("save_cv_folds")
    save_cv_models = args.pop("save_cv_models")
    use_folds_file_for_grid_search = args.pop("use_folds_file_for_grid_search")
    stratified_folds = args.pop("do_stratified_folds")
    label_col = args.pop("label_col")
    id_col = args.pop("id_col")
    ids_to_floats = args.pop("ids_to_floats")
    class_map = args.pop("class_map")
    custom_learner_path = args.pop("custom_learner_path")
    custom_metric_path = args.pop("custom_metric_path")
    quiet = args.pop('quiet', False)
    learning_curve_cv_folds = args.pop("learning_curve_cv_folds")
    learning_curve_train_sizes = args.pop("learning_curve_train_sizes")

    if args:
        raise ValueError(("Extra arguments passed to _classify_featureset: "
                          "{}").format(args.keys()))
    start_timestamp = datetime.datetime.now()

    # create a new SKLL logger for this specific job and
    # use the given log level
    logger = get_skll_logger(job_name, job_log_file, log_level=job_log_level)

    try:

        # log messages
        logger.info("Task: {}".format(task))

        # check if we have any possible custom metrics
        possible_custom_metric_names = []
        for metric_name in output_metrics + [grid_objective]:
            # metrics that are not in `SCORERS` or `None` are candidates
            # (the `None` is a by-product of how jobs with single tuning
            # objectives are created)
            if metric_name not in SCORERS and metric_name is not None:
                possible_custom_metric_names.append(metric_name)
            # if the metric is already in `SCORERS`, is it a custom one
            # that we already registered? if so, log that
            elif metric_name in _CUSTOM_METRICS:
                logger.info(
                    f"custom metric '{metric_name}' is already registered")

        # initialize list that will hold any invalid metrics
        # that we could not register as custom metrics
        invalid_metric_names = []

        # if we have possible custom metrics
        if possible_custom_metric_names:

            # check that we have a file to load them from
            if not custom_metric_path:
                raise ValueError(
                    f"invalid metrics specified: {possible_custom_metric_names}")
            else:
                # try to register each possible custom metric;
                # raise an exception if we fail, and if we don't then
                # add the custom metric function to `globals()` so
                # that it serializes properly for gridmap
                for custom_metric_name in possible_custom_metric_names:
                    try:
                        custom_metric_func = register_custom_metric(
                            custom_metric_path, custom_metric_name)
                    except (AttributeError, NameError, ValueError):
                        invalid_metric_names.append(custom_metric_name)
                    else:
                        logger.info(f"registered '{custom_metric_name}' as a "
                                    f"custom metric")
                        globals()[custom_metric_name] = custom_metric_func

        # raise an error if we have any invalid metrics
        if invalid_metric_names:
            raise ValueError(
                f"invalid metrics specified: {invalid_metric_names}. "
                f"If these are custom metrics, check the function names.")

        if task == 'cross_validate':
            if isinstance(cv_folds, int):
                num_folds = cv_folds
            else:
                # folds_file was used, so count the unique fold ids.
                num_folds = len(set(cv_folds.values()))
            logger.info("Cross-validating ({} folds) on {}, feature "
                        "set {} ...".format(num_folds, train_set_name,
                                            featureset))
        elif task == 'evaluate':
            logger.info("Training on {}, Test on {}, "
                        "feature set {} ...".format(train_set_name,
                                                    test_set_name,
                                                    featureset))
        elif task == 'train':
            logger.info("Training on {}, feature set {} ...".format(
                train_set_name, featureset))
        elif task == 'learning_curve':
            logger.info("Generating learning curve "
                        "({} 80/20 folds, sizes={}, objective={}) on {}, "
                        "feature set {} ...".format(
                            learning_curve_cv_folds,
                            learning_curve_train_sizes,
                            grid_objective,
                            train_set_name,
                            featureset))
        else:  # predict
            logger.info("Training on {}, Making predictions on {}, "
                        "feature set {} ...".format(train_set_name,
                                                    test_set_name,
                                                    featureset))

        # check whether a trained model on the same data with the same
        # featureset already exists; if so, load it and then use it on
        # the test data
        modelfile = join(model_path, '{}.model'.format(job_name))
        if (task in ['cross_validate', 'learning_curve'] or
                not exists(modelfile) or overwrite):
            train_examples = load_featureset(train_path,
                                             featureset,
                                             suffix,
                                             label_col=label_col,
                                             id_col=id_col,
                                             ids_to_floats=ids_to_floats,
                                             quiet=quiet,
                                             class_map=class_map,
                                             feature_hasher=feature_hasher,
                                             num_features=hasher_features,
                                             logger=logger)

            train_set_size = len(train_examples.ids)
            if not train_examples.has_labels:
                raise ValueError('Training examples do not have labels')

            # initialize a classifier object
            learner = Learner(learner_name,
                              probability=probability,
                              pipeline=pipeline,
                              feature_scaling=feature_scaling,
                              model_kwargs=fixed_parameters,
                              pos_label_str=pos_label_str,
                              min_feature_count=min_feature_count,
                              sampler=sampler,
                              sampler_kwargs=sampler_parameters,
                              custom_learner_path=custom_learner_path,
                              logger=logger)
        # load the model if it already exists
        else:
            # import custom learner into global namespace if we are reusing
            # a saved model
            if custom_learner_path:
                globals()[learner_name] = load_custom_learner(
                    custom_learner_path, learner_name)
            train_set_size = 'unknown'
            if exists(modelfile) and not overwrite:
                logger.info("Loading pre-existing {} model: {}".format(
                    learner_name, modelfile))
            learner = Learner.from_file(modelfile)

            # attach the job logger to this learner
            learner.logger = logger

        # Load test set if there is one
        if task == 'evaluate' or task == 'predict':
            test_examples = load_featureset(test_path,
                                            featureset,
                                            suffix,
                                            label_col=label_col,
                                            id_col=id_col,
                                            ids_to_floats=ids_to_floats,
                                            quiet=quiet,
                                            class_map=class_map,
                                            feature_hasher=feature_hasher,
                                            num_features=hasher_features)
            test_set_size = len(test_examples.ids)
        else:
            test_set_size = 'n/a'

        # compute information about xval and grid folds that can be put
        # in the results in readable form
        if isinstance(cv_folds, dict):
            cv_folds_to_print = '{} via folds file'.format(
                len(set(cv_folds.values())))
        else:
            cv_folds_to_print = str(cv_folds)

        if isinstance(grid_search_folds, dict):
            grid_search_folds_to_print = '{} via folds file'.format(
                len(set(grid_search_folds.values())))
        else:
            grid_search_folds_to_print = str(grid_search_folds)

        # create a list of dictionaries of the results information
        learner_result_dict_base = {
            'experiment_name': experiment_name,
            'train_set_name': train_set_name,
            'train_set_size': train_set_size,
            'test_set_name': test_set_name,
            'test_set_size': test_set_size,
            'featureset': json.dumps(featureset),
            'featureset_name': featureset_name,
            'shuffle': shuffle,
            'learner_name': learner_name,
            'task': task,
            'start_timestamp': start_timestamp.strftime('%d %b %Y %H:%M:%S.%f'),
            'version': __version__,
            'feature_scaling': feature_scaling,
            'folds_file': folds_file,
            'grid_search': grid_search,
            'grid_objective': grid_objective,
            'grid_search_folds': grid_search_folds_to_print,
            'min_feature_count': min_feature_count,
            'cv_folds': cv_folds_to_print,
            'using_folds_file': isinstance(cv_folds, dict) or isinstance(grid_search_folds, dict),
            'save_cv_folds': save_cv_folds,
            'save_cv_models': save_cv_models,
            'use_folds_file_for_grid_search': use_folds_file_for_grid_search,
            'stratified_folds': stratified_folds,
            'scikit_learn_version': SCIKIT_VERSION
        }

        # check if we're doing cross-validation, because we only load/save
        # models when we're not.
        task_results = None
        if task == 'cross_validate':
            logger.info('Cross-validating')
            (
                task_results,
                grid_scores,
                grid_search_cv_results_dicts,
                skll_fold_ids,
                models
            ) = learner.cross_validate(
                train_examples,
                shuffle=shuffle,
                stratified=stratified_folds,
                prediction_prefix=prediction_prefix,
                grid_search=grid_search,
                grid_search_folds=grid_search_folds,
                cv_folds=cv_folds,
                grid_objective=grid_objective,
                output_metrics=output_metrics,
                param_grid=param_grid,
                grid_jobs=grid_search_jobs,
                save_cv_folds=save_cv_folds,
                save_cv_models=save_cv_models,
                use_custom_folds_for_grid_search=use_folds_file_for_grid_search
            )
            if models:
                for index, m in enumerate(models, start=1):
                    modelfile = join(model_path,
                                     '{}_fold{}.model'.format(job_name, index))
                    m.save(modelfile)
        elif task == 'learning_curve':
            logger.info("Generating learning curve(s)")
            (curve_train_scores,
             curve_test_scores,
             computed_curve_train_sizes) = learner.learning_curve(
                train_examples,
                grid_objective,
                cv_folds=learning_curve_cv_folds,
                train_sizes=learning_curve_train_sizes)
        else:
            # if we do not have a saved model, we need to train one.
            if not exists(modelfile) or overwrite:
                logger.info("Featurizing and training new {} model".format(
                    learner_name))

                (best_score,
                 grid_search_cv_results) = learner.train(
                    train_examples,
                    shuffle=shuffle,
                    grid_search=grid_search,
                    grid_search_folds=grid_search_folds,
                    grid_objective=grid_objective,
                    param_grid=param_grid,
                    grid_jobs=grid_search_jobs)
                grid_scores = [best_score]
                grid_search_cv_results_dicts = [grid_search_cv_results]

                # save model
                if model_path:
                    learner.save(modelfile)

                if grid_search:
                    logger.info("Best {} grid search score: {}".format(
                        grid_objective, round(best_score, 3)))
            else:
                grid_scores = [None]
                grid_search_cv_results_dicts = [None]

            # print out the parameters
            param_out = ('{}: {}'.format(param_name, param_value)
                         for param_name, param_value in
                         learner.model.get_params().items())
            logger.info("Hyperparameters: {}".format(', '.join(param_out)))

            # run on test set or cross-validate on training data,
            # depending on what was asked for
            if task == 'evaluate':
                logger.info("Evaluating predictions")
                task_results = [
                    learner.evaluate(test_examples,
                                     prediction_prefix=prediction_prefix,
                                     grid_objective=grid_objective,
                                     output_metrics=output_metrics)
                ]
            elif task == 'predict':
                logger.info("Writing predictions")
                # we set `class_labels` to `False` so that if the learner is
                # probabilistic, probabilities are written instead of labels
                learner.predict(test_examples,
                                prediction_prefix=prediction_prefix,
                                class_labels=False)
            # do nothing here for train

        end_timestamp = datetime.datetime.now()
        learner_result_dict_base['end_timestamp'] = end_timestamp.strftime(
            '%d %b %Y %H:%M:%S.%f')
        total_time = end_timestamp - start_timestamp
        learner_result_dict_base['total_time'] = str(total_time)

        if task == 'cross_validate' or task == 'evaluate':
            results_json_path = join(results_path,
                                     '{}.results.json'.format(job_name))

            res = _create_learner_result_dicts(task_results,
                                               grid_scores,
                                               grid_search_cv_results_dicts,
                                               learner_result_dict_base)

            # write out the result dictionary to a json file
            with open(results_json_path, 'w') as json_file:
                json.dump(res, json_file, cls=NumpyTypeEncoder)

            with open(join(results_path, '{}.results'.format(job_name)),
                      'w') as output_file:
                _print_fancy_output(res, output_file)

        elif task == 'learning_curve':
            results_json_path = join(results_path,
                                     '{}.results.json'.format(job_name))

            res = {}
            res.update(learner_result_dict_base)
            res.update({
                'learning_curve_cv_folds': learning_curve_cv_folds,
                'given_curve_train_sizes': learning_curve_train_sizes,
                'learning_curve_train_scores_means': np.mean(curve_train_scores, axis=1),
                'learning_curve_test_scores_means': np.mean(curve_test_scores, axis=1),
                'learning_curve_train_scores_stds': np.std(curve_train_scores, axis=1, ddof=1),
                'learning_curve_test_scores_stds': np.std(curve_test_scores, axis=1, ddof=1),
                'computed_curve_train_sizes': computed_curve_train_sizes
            })

            # we need to return and write out a list of dictionaries
            res = [res]

            # write out the result dictionary to a json file
            with open(results_json_path, 'w') as json_file:
                json.dump(res, json_file, cls=NumpyTypeEncoder)

        # For all other tasks, i.e. train or predict
        else:
            if results_path:
                results_json_path = join(results_path,
                                         '{}.results.json'.format(job_name))

                assert len(grid_scores) == 1
                assert len(grid_search_cv_results_dicts) == 1
                grid_search_cv_results_dict = {"grid_score": grid_scores[0]}
                grid_search_cv_results_dict["grid_search_cv_results"] = \
                    grid_search_cv_results_dicts[0]
                grid_search_cv_results_dict.update(learner_result_dict_base)
                # write out the result dictionary to a json file
                with open(results_json_path, 'w') as json_file:
                    json.dump(grid_search_cv_results_dict, json_file,
                              cls=NumpyTypeEncoder)
            res = [learner_result_dict_base]

        # write out the cv folds if required
        if task == 'cross_validate' and save_cv_folds:
            skll_fold_ids_file = experiment_name + '_skll_fold_ids.csv'
            with open(join(results_path, skll_fold_ids_file),
                      'w') as output_file:
                _write_skll_folds(skll_fold_ids, output_file)

    finally:
        close_and_remove_logger_handlers(logger)

    return res
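# A minimal sketch of the kind of file that `custom_metric_path` above could
# point to when custom metrics are registered via register_custom_metric().
# The file name and the `f075_macro` function are hypothetical; the only
# requirement assumed here is a callable that takes true and predicted labels
# and returns a single float.
#
# custom_metrics.py
from sklearn.metrics import fbeta_score


def f075_macro(y_true, y_pred):
    """F-beta score with beta=0.75, macro-averaged over labels."""
    return fbeta_score(y_true, y_pred, beta=0.75, average='macro')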
def _classify_featureset(args):
    """
    Classification job to be submitted to grid
    """

    # Extract all the arguments.
    # (There doesn't seem to be a better way to do this since one can't
    # specify required keyword arguments.)
    experiment_name = args.pop("experiment_name")
    task = args.pop("task")
    sampler = args.pop("sampler")
    feature_hasher = args.pop("feature_hasher")
    hasher_features = args.pop("hasher_features")
    job_name = args.pop("job_name")
    featureset = args.pop("featureset")
    featureset_name = args.pop("featureset_name")
    learner_name = args.pop("learner_name")
    train_path = args.pop("train_path")
    test_path = args.pop("test_path")
    train_set_name = args.pop("train_set_name")
    test_set_name = args.pop("test_set_name")
    shuffle = args.pop('shuffle')
    model_path = args.pop("model_path")
    prediction_prefix = args.pop("prediction_prefix")
    grid_search = args.pop("grid_search")
    grid_objective = args.pop("grid_objective")
    suffix = args.pop("suffix")
    log_path = args.pop("log_path")
    probability = args.pop("probability")
    results_path = args.pop("results_path")
    fixed_parameters = args.pop("fixed_parameters")
    sampler_parameters = args.pop("sampler_parameters")
    param_grid = args.pop("param_grid")
    pos_label_str = args.pop("pos_label_str")
    overwrite = args.pop("overwrite")
    feature_scaling = args.pop("feature_scaling")
    min_feature_count = args.pop("min_feature_count")
    grid_search_jobs = args.pop("grid_search_jobs")
    grid_search_folds = args.pop("grid_search_folds")
    cv_folds = args.pop("cv_folds")
    stratified_folds = args.pop("do_stratified_folds")
    label_col = args.pop("label_col")
    id_col = args.pop("id_col")
    ids_to_floats = args.pop("ids_to_floats")
    class_map = args.pop("class_map")
    custom_learner_path = args.pop("custom_learner_path")
    quiet = args.pop('quiet', False)

    if args:
        raise ValueError(("Extra arguments passed to _classify_featureset: "
                          "{}").format(args.keys()))
    start_timestamp = datetime.datetime.now()

    with open(log_path, 'w') as log_file:
        # logging
        print("Task:", task, file=log_file)
        if task == 'cross_validate':
            print(("Cross-validating ({} folds) on {}, feature " +
                   "set {} ...").format(cv_folds, train_set_name, featureset),
                  file=log_file)
        elif task == 'evaluate':
            print(("Training on {}, Test on {}, " +
                   "feature set {} ...").format(train_set_name, test_set_name,
                                                featureset),
                  file=log_file)
        elif task == 'train':
            print("Training on {}, feature set {} ...".format(train_set_name,
                                                              featureset),
                  file=log_file)
        else:  # predict
            print(("Training on {}, Making predictions about {}, " +
                   "feature set {} ...").format(train_set_name, test_set_name,
                                                featureset),
                  file=log_file)

        # check whether a trained model on the same data with the same
        # featureset already exists; if so, load it and then use it on
        # the test data
        modelfile = join(model_path, '{}.model'.format(job_name))

        if task == 'cross_validate' or (not exists(modelfile) or overwrite):
            train_examples = _load_featureset(train_path,
                                              featureset,
                                              suffix,
                                              label_col=label_col,
                                              id_col=id_col,
                                              ids_to_floats=ids_to_floats,
                                              quiet=quiet,
                                              class_map=class_map,
                                              feature_hasher=feature_hasher,
                                              num_features=hasher_features)

            train_set_size = len(train_examples.ids)
            if not train_examples.has_labels:
                raise ValueError('Training examples do not have labels')
            # initialize a classifier object
            learner = Learner(learner_name,
                              probability=probability,
                              feature_scaling=feature_scaling,
                              model_kwargs=fixed_parameters,
                              pos_label_str=pos_label_str,
                              min_feature_count=min_feature_count,
                              sampler=sampler,
                              sampler_kwargs=sampler_parameters,
                              custom_learner_path=custom_learner_path)
        # load the model if it already exists
        else:
            # import the custom learner path here in case we are reusing a
            # saved model
            if custom_learner_path:
                _import_custom_learner(custom_learner_path, learner_name)
            train_set_size = 'unknown'
            if exists(modelfile) and not overwrite:
                print(('\tloading pre-existing %s model: %s') % (learner_name,
                                                                 modelfile))
            learner = Learner.from_file(modelfile)

        # Load test set if there is one
        if task == 'evaluate' or task == 'predict':
            test_examples = _load_featureset(test_path,
                                             featureset,
                                             suffix,
                                             label_col=label_col,
                                             id_col=id_col,
                                             ids_to_floats=ids_to_floats,
                                             quiet=quiet,
                                             class_map=class_map,
                                             feature_hasher=feature_hasher,
                                             num_features=hasher_features)
            test_set_size = len(test_examples.ids)
        else:
            test_set_size = 'n/a'

        # create a list of dictionaries of the results information
        learner_result_dict_base = {'experiment_name': experiment_name,
                                    'train_set_name': train_set_name,
                                    'train_set_size': train_set_size,
                                    'test_set_name': test_set_name,
                                    'test_set_size': test_set_size,
                                    'featureset': json.dumps(featureset),
                                    'featureset_name': featureset_name,
                                    'shuffle': shuffle,
                                    'learner_name': learner_name,
                                    'task': task,
                                    'start_timestamp':
                                        start_timestamp.strftime('%d %b %Y %H:%M:%S.%f'),
                                    'version': __version__,
                                    'feature_scaling': feature_scaling,
                                    'grid_search': grid_search,
                                    'grid_objective': grid_objective,
                                    'grid_search_folds': grid_search_folds,
                                    'min_feature_count': min_feature_count,
                                    'cv_folds': cv_folds,
                                    'stratified_folds': stratified_folds,
                                    'scikit_learn_version': SCIKIT_VERSION}

        # check if we're doing cross-validation, because we only load/save
        # models when we're not.
        task_results = None
        if task == 'cross_validate':
            print('\tcross-validating', file=log_file)
            task_results, grid_scores = learner.cross_validate(
                train_examples,
                shuffle=shuffle,
                stratified=stratified_folds,
                prediction_prefix=prediction_prefix,
                grid_search=grid_search,
                grid_search_folds=grid_search_folds,
                cv_folds=cv_folds,
                grid_objective=grid_objective,
                param_grid=param_grid,
                grid_jobs=grid_search_jobs)
        else:
            # if we do not have a saved model, we need to train one.
            if not exists(modelfile) or overwrite:
                print(('\tfeaturizing and training new ' +
                       '{} model').format(learner_name),
                      file=log_file)

                if not isinstance(cv_folds, int):
                    grid_search_folds = cv_folds

                best_score = learner.train(train_examples,
                                           shuffle=shuffle,
                                           grid_search=grid_search,
                                           grid_search_folds=grid_search_folds,
                                           grid_objective=grid_objective,
                                           param_grid=param_grid,
                                           grid_jobs=grid_search_jobs)
                grid_scores = [best_score]

                # save model
                if model_path:
                    learner.save(modelfile)

                if grid_search:
                    # note: bankers' rounding is used in python 3,
                    # so these scores may be different between runs in
                    # python 2 and 3 at the final decimal place.
                    print('\tbest {} grid search score: {}'
                          .format(grid_objective, round(best_score, 3)),
                          file=log_file)
            else:
                grid_scores = [None]

            # print out the tuned parameters and best CV score
            param_out = ('{}: {}'.format(param_name, param_value)
                         for param_name, param_value in
                         iteritems(learner.model.get_params()))
            print('\thyperparameters: {}'.format(', '.join(param_out)),
                  file=log_file)

            # run on test set or cross-validate on training data,
            # depending on what was asked for
            if task == 'evaluate':
                print('\tevaluating predictions', file=log_file)
                task_results = [learner.evaluate(
                    test_examples,
                    prediction_prefix=prediction_prefix,
                    grid_objective=grid_objective)]
            elif task == 'predict':
                print('\twriting predictions', file=log_file)
                learner.predict(test_examples,
                                prediction_prefix=prediction_prefix)
            # do nothing here for train

        end_timestamp = datetime.datetime.now()
        learner_result_dict_base['end_timestamp'] = end_timestamp.strftime(
            '%d %b %Y %H:%M:%S.%f')
        total_time = end_timestamp - start_timestamp
        learner_result_dict_base['total_time'] = str(total_time)

        if task == 'cross_validate' or task == 'evaluate':
            results_json_path = join(results_path,
                                     '{}.results.json'.format(job_name))

            res = _create_learner_result_dicts(task_results,
                                               grid_scores,
                                               learner_result_dict_base)

            # write out the result dictionary to a json file
            file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
            with open(results_json_path, file_mode) as json_file:
                json.dump(res, json_file, cls=NumpyTypeEncoder)

            with open(join(results_path, '{}.results'.format(job_name)),
                      'w') as output_file:
                _print_fancy_output(res, output_file)
        else:
            res = [learner_result_dict_base]

    return res
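# A minimal sketch of a NumPy-aware JSON encoder along the lines of the
# NumpyTypeEncoder used with json.dump() above. This is an illustration
# under that assumption, not SKLL's actual implementation.
import json

import numpy as np


class NumpyTypeEncoder(json.JSONEncoder):
    """Convert NumPy scalars and arrays into plain Python types for JSON."""

    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)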
def check_print_model_weights(task='classification'):

    # create some simple classification or regression data
    if task == 'classification' or task == 'classification_no_intercept':
        train_fs, _ = make_classification_data(train_test_ratio=0.8)
    elif task == 'multiclass_classification':
        train_fs, _ = make_classification_data(train_test_ratio=0.8,
                                                num_labels=3)
    else:
        train_fs, _, _ = make_regression_data(num_features=4,
                                              train_test_ratio=0.8)

    # now train the appropriate model
    if task == 'classification' or task == 'multiclass_classification':
        learner = Learner('LogisticRegression')
        learner.train(train_fs, grid_objective='f1_score_micro')
    elif task == 'classification_no_intercept':
        learner = Learner('LogisticRegression')
        learner.train(train_fs, grid_objective='f1_score_micro',
                      param_grid=[{'fit_intercept': [False]}])
    elif task == 'regression':
        learner = Learner('LinearRegression')
        learner.train(train_fs, grid_objective='pearson')
    else:
        learner = Learner('LinearSVR')
        learner.train(train_fs, grid_objective='pearson')

    # now save the model to disk
    model_file = join(_my_dir, 'output', 'test_print_model_weights.model')
    learner.save(model_file)

    # now call print_model_weights main() and capture the output
    print_model_weights_cmd = [model_file]
    err = ''
    try:
        old_stderr = sys.stderr
        old_stdout = sys.stdout
        sys.stderr = mystderr = StringIO()
        sys.stdout = mystdout = StringIO()
        pmw.main(print_model_weights_cmd)
        out = mystdout.getvalue()
        err = mystderr.getvalue()
    finally:
        sys.stderr = old_stderr
        sys.stdout = old_stdout
        print(err)

    # now parse the output of the print_model_weight command
    # and get the intercept and the feature values
    if task == 'classification':
        lines_to_parse = [l for l in out.split('\n')[1:] if l]
        intercept = safe_float(lines_to_parse[0].split('\t')[0])
        feature_values = []
        for ltp in lines_to_parse[1:]:
            fields = ltp.split('\t')
            feature_values.append((fields[2], safe_float(fields[0])))
        feature_values = [t[1] for t in sorted(feature_values)]
        assert_almost_equal(intercept, learner.model.intercept_[0])
        assert_allclose(learner.model.coef_[0], feature_values)
    elif task == 'multiclass_classification':
        # for multiple classes we get an intercept for each class
        # as well as a list of weights for each class
        lines_to_parse = [l for l in out.split('\n')[1:] if l]
        intercept = []
        for intercept_string in lines_to_parse[0:3]:
            intercept.append(safe_float(intercept_string.split('\t')[0]))
        feature_values = [[], [], []]
        for ltp in lines_to_parse[3:]:
            fields = ltp.split('\t')
            feature_values[int(fields[1])].append((fields[2],
                                                   safe_float(fields[0])))
        for index, weights in enumerate(feature_values):
            feature_values[index] = [t[1] for t in sorted(weights)]
        for index, weights in enumerate(learner.model.coef_):
            assert_array_almost_equal(weights, feature_values[index])
        assert_array_almost_equal(intercept, learner.model.intercept_)
    elif task == 'classification_no_intercept':
        lines_to_parse = [l for l in out.split('\n')[0:] if l]
        intercept = safe_float(lines_to_parse[0].split('=')[1])
        feature_values = []
        for ltp in lines_to_parse[1:]:
            fields = ltp.split('\t')
            feature_values.append((fields[2], safe_float(fields[0])))
        feature_values = [t[1] for t in sorted(feature_values)]
        assert_almost_equal(intercept, learner.model.intercept_)
        assert_allclose(learner.model.coef_[0], feature_values)
    elif task == 'regression':
        lines_to_parse = [l for l in out.split('\n') if l]
        intercept = safe_float(lines_to_parse[0].split('=')[1])
        feature_values = []
        for ltp in lines_to_parse[1:]:
            fields = ltp.split('\t')
            feature_values.append((fields[1], safe_float(fields[0])))
        feature_values = [t[1] for t in sorted(feature_values)]
        assert_almost_equal(intercept, learner.model.intercept_)
        assert_allclose(learner.model.coef_, feature_values)
    else:
        lines_to_parse = [l for l in out.split('\n') if l]
        intercept_list = ast.literal_eval(lines_to_parse[0].split('=')[1].strip())
        intercept = []
        for intercept_string in intercept_list:
            intercept.append(safe_float(intercept_string))
        feature_values = []
        for ltp in lines_to_parse[1:]:
            fields = ltp.split('\t')
            feature_values.append((fields[1], safe_float(fields[0])))
        feature_values = [t[1] for t in sorted(feature_values)]
        assert_array_almost_equal(intercept, learner.model.intercept_)
        assert_allclose(learner.model.coef_, feature_values)
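# A minimal sketch of how check_print_model_weights() might be invoked for
# each supported task. The yield-based nose-style generator and the
# 'linearsvr' label (any value not matched by the earlier branches falls
# through to the LinearSVR case) are assumptions for illustration rather
# than the exact test shipped with SKLL.
def test_print_model_weights():
    for task in ['classification',
                 'classification_no_intercept',
                 'multiclass_classification',
                 'regression',
                 'linearsvr']:
        yield check_print_model_weights, task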
def _classify_featureset(args):
    """
    Classification job to be submitted to grid
    """

    # Extract all the arguments.
    # (There doesn't seem to be a better way to do this since one can't
    # specify required keyword arguments.)
    experiment_name = args.pop("experiment_name")
    task = args.pop("task")
    sampler = args.pop("sampler")
    feature_hasher = args.pop("feature_hasher")
    hasher_features = args.pop("hasher_features")
    job_name = args.pop("job_name")
    featureset = args.pop("featureset")
    featureset_name = args.pop("featureset_name")
    learner_name = args.pop("learner_name")
    train_path = args.pop("train_path")
    test_path = args.pop("test_path")
    train_set_name = args.pop("train_set_name")
    test_set_name = args.pop("test_set_name")
    shuffle = args.pop('shuffle')
    model_path = args.pop("model_path")
    prediction_prefix = args.pop("prediction_prefix")
    grid_search = args.pop("grid_search")
    grid_objective = args.pop("grid_objective")
    suffix = args.pop("suffix")
    log_path = args.pop("log_path")
    probability = args.pop("probability")
    results_path = args.pop("results_path")
    fixed_parameters = args.pop("fixed_parameters")
    sampler_parameters = args.pop("sampler_parameters")
    param_grid = args.pop("param_grid")
    pos_label_str = args.pop("pos_label_str")
    overwrite = args.pop("overwrite")
    feature_scaling = args.pop("feature_scaling")
    min_feature_count = args.pop("min_feature_count")
    grid_search_jobs = args.pop("grid_search_jobs")
    grid_search_folds = args.pop("grid_search_folds")
    cv_folds = args.pop("cv_folds")
    save_cv_folds = args.pop("save_cv_folds")
    stratified_folds = args.pop("do_stratified_folds")
    label_col = args.pop("label_col")
    id_col = args.pop("id_col")
    ids_to_floats = args.pop("ids_to_floats")
    class_map = args.pop("class_map")
    custom_learner_path = args.pop("custom_learner_path")
    quiet = args.pop('quiet', False)

    if args:
        raise ValueError(("Extra arguments passed to _classify_featureset: "
                          "{}").format(args.keys()))
    start_timestamp = datetime.datetime.now()

    with open(log_path, 'w') as log_file:
        # logging
        print("Task:", task, file=log_file)
        if task == 'cross_validate':
            print(("Cross-validating ({} folds) on {}, feature " +
                   "set {} ...").format(cv_folds, train_set_name, featureset),
                  file=log_file)
        elif task == 'evaluate':
            print(("Training on {}, Test on {}, " +
                   "feature set {} ...").format(train_set_name, test_set_name,
                                                featureset),
                  file=log_file)
        elif task == 'train':
            print("Training on {}, feature set {} ...".format(train_set_name,
                                                              featureset),
                  file=log_file)
        else:  # predict
            print(("Training on {}, Making predictions about {}, " +
                   "feature set {} ...").format(train_set_name, test_set_name,
                                                featureset),
                  file=log_file)

        # check whether a trained model on the same data with the same
        # featureset already exists; if so, load it and then use it on
        # the test data
        modelfile = join(model_path, '{}.model'.format(job_name))

        if task == 'cross_validate' or (not exists(modelfile) or overwrite):
            train_examples = _load_featureset(train_path,
                                              featureset,
                                              suffix,
                                              label_col=label_col,
                                              id_col=id_col,
                                              ids_to_floats=ids_to_floats,
                                              quiet=quiet,
                                              class_map=class_map,
                                              feature_hasher=feature_hasher,
                                              num_features=hasher_features)

            train_set_size = len(train_examples.ids)
            if not train_examples.has_labels:
                raise ValueError('Training examples do not have labels')
            # initialize a classifier object
            learner = Learner(learner_name,
                              probability=probability,
                              feature_scaling=feature_scaling,
                              model_kwargs=fixed_parameters,
                              pos_label_str=pos_label_str,
                              min_feature_count=min_feature_count,
                              sampler=sampler,
                              sampler_kwargs=sampler_parameters,
                              custom_learner_path=custom_learner_path)
        # load the model if it already exists
        else:
            # import the custom learner path here in case we are reusing a
            # saved model
            if custom_learner_path:
                _import_custom_learner(custom_learner_path, learner_name)
            train_set_size = 'unknown'
            if exists(modelfile) and not overwrite:
                print(('\tloading pre-existing %s model: %s') % (learner_name,
                                                                 modelfile))
            learner = Learner.from_file(modelfile)

        # Load test set if there is one
        if task == 'evaluate' or task == 'predict':
            test_examples = _load_featureset(test_path,
                                             featureset,
                                             suffix,
                                             label_col=label_col,
                                             id_col=id_col,
                                             ids_to_floats=ids_to_floats,
                                             quiet=quiet,
                                             class_map=class_map,
                                             feature_hasher=feature_hasher,
                                             num_features=hasher_features)
            test_set_size = len(test_examples.ids)
        else:
            test_set_size = 'n/a'

        # create a list of dictionaries of the results information
        learner_result_dict_base = {'experiment_name': experiment_name,
                                    'train_set_name': train_set_name,
                                    'train_set_size': train_set_size,
                                    'test_set_name': test_set_name,
                                    'test_set_size': test_set_size,
                                    'featureset': json.dumps(featureset),
                                    'featureset_name': featureset_name,
                                    'shuffle': shuffle,
                                    'learner_name': learner_name,
                                    'task': task,
                                    'start_timestamp':
                                        start_timestamp.strftime('%d %b %Y %H:%M:%S.%f'),
                                    'version': __version__,
                                    'feature_scaling': feature_scaling,
                                    'grid_search': grid_search,
                                    'grid_objective': grid_objective,
                                    'grid_search_folds': grid_search_folds,
                                    'min_feature_count': min_feature_count,
                                    'cv_folds': cv_folds,
                                    'save_cv_folds': save_cv_folds,
                                    'stratified_folds': stratified_folds,
                                    'scikit_learn_version': SCIKIT_VERSION}

        # check if we're doing cross-validation, because we only load/save
        # models when we're not.
        task_results = None
        if task == 'cross_validate':
            print('\tcross-validating', file=log_file)
            task_results, grid_scores, skll_fold_ids = learner.cross_validate(
                train_examples,
                shuffle=shuffle,
                stratified=stratified_folds,
                prediction_prefix=prediction_prefix,
                grid_search=grid_search,
                grid_search_folds=grid_search_folds,
                cv_folds=cv_folds,
                grid_objective=grid_objective,
                param_grid=param_grid,
                grid_jobs=grid_search_jobs,
                save_cv_folds=save_cv_folds)
        else:
            # if we do not have a saved model, we need to train one.
            if not exists(modelfile) or overwrite:
                print(('\tfeaturizing and training new ' +
                       '{} model').format(learner_name),
                      file=log_file)

                if not isinstance(cv_folds, int):
                    grid_search_folds = cv_folds

                best_score = learner.train(train_examples,
                                           shuffle=shuffle,
                                           grid_search=grid_search,
                                           grid_search_folds=grid_search_folds,
                                           grid_objective=grid_objective,
                                           param_grid=param_grid,
                                           grid_jobs=grid_search_jobs)
                grid_scores = [best_score]

                # save model
                if model_path:
                    learner.save(modelfile)

                if grid_search:
                    # note: bankers' rounding is used in python 3,
                    # so these scores may be different between runs in
                    # python 2 and 3 at the final decimal place.
                    print('\tbest {} grid search score: {}'
                          .format(grid_objective, round(best_score, 3)),
                          file=log_file)
            else:
                grid_scores = [None]

            # print out the tuned parameters and best CV score
            param_out = ('{}: {}'.format(param_name, param_value)
                         for param_name, param_value in
                         iteritems(learner.model.get_params()))
            print('\thyperparameters: {}'.format(', '.join(param_out)),
                  file=log_file)

            # run on test set or cross-validate on training data,
            # depending on what was asked for
            if task == 'evaluate':
                print('\tevaluating predictions', file=log_file)
                task_results = [learner.evaluate(
                    test_examples,
                    prediction_prefix=prediction_prefix,
                    grid_objective=grid_objective)]
            elif task == 'predict':
                print('\twriting predictions', file=log_file)
                learner.predict(test_examples,
                                prediction_prefix=prediction_prefix)
            # do nothing here for train

        end_timestamp = datetime.datetime.now()
        learner_result_dict_base['end_timestamp'] = end_timestamp.strftime(
            '%d %b %Y %H:%M:%S.%f')
        total_time = end_timestamp - start_timestamp
        learner_result_dict_base['total_time'] = str(total_time)

        if task == 'cross_validate' or task == 'evaluate':
            results_json_path = join(results_path,
                                     '{}.results.json'.format(job_name))

            res = _create_learner_result_dicts(task_results,
                                               grid_scores,
                                               learner_result_dict_base)

            # write out the result dictionary to a json file
            file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
            with open(results_json_path, file_mode) as json_file:
                json.dump(res, json_file, cls=NumpyTypeEncoder)

            with open(join(results_path, '{}.results'.format(job_name)),
                      'w') as output_file:
                _print_fancy_output(res, output_file)
        else:
            res = [learner_result_dict_base]

        # write out the cv folds if required
        if task == 'cross_validate' and save_cv_folds:
            skll_fold_ids_file = experiment_name + '_skll_fold_ids.csv'
            file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
            with open(join(results_path, skll_fold_ids_file),
                      file_mode) as output_file:
                _write_skll_folds(skll_fold_ids, output_file)

    return res
def _classify_featureset(args):
    '''
    Classification job to be submitted to grid
    '''

    # Extract all the arguments.
    # (There doesn't seem to be a better way to do this since one can't
    # specify required keyword arguments.)
    experiment_name = args.pop("experiment_name")
    task = args.pop("task")
    job_name = args.pop("job_name")
    featureset = args.pop("featureset")
    learner_name = args.pop("learner_name")
    train_path = args.pop("train_path")
    test_path = args.pop("test_path")
    train_set_name = args.pop("train_set_name")
    test_set_name = args.pop("test_set_name")
    model_path = args.pop("model_path")
    prediction_prefix = args.pop("prediction_prefix")
    grid_search = args.pop("grid_search")
    grid_objective = args.pop("grid_objective")
    suffix = args.pop("suffix")
    log_path = args.pop("log_path")
    probability = args.pop("probability")
    results_path = args.pop("results_path")
    fixed_parameters = args.pop("fixed_parameters")
    param_grid = args.pop("param_grid")
    pos_label_str = args.pop("pos_label_str")
    overwrite = args.pop("overwrite")
    feature_scaling = args.pop("feature_scaling")
    min_feature_count = args.pop("min_feature_count")
    grid_search_jobs = args.pop("grid_search_jobs")
    cv_folds = args.pop("cv_folds")
    label_col = args.pop("label_col")
    ids_to_floats = args.pop("ids_to_floats")
    class_map = args.pop("class_map")
    quiet = args.pop('quiet', False)

    if args:
        raise ValueError(("Extra arguments passed to _classify_featureset: " +
                          "{}").format(args.keys()))

    timestamp = datetime.datetime.now().strftime('%d %b %Y %H:%M:%S')

    with open(log_path, 'w') as log_file:
        # logging
        print("Task:", task, file=log_file)
        if task == 'cross_validate':
            print(("Cross-validating on {}, feature " +
                   "set {} ...").format(train_set_name, featureset),
                  file=log_file)
        elif task == 'evaluate':
            print(("Training on {}, Test on {}, " +
                   "feature set {} ...").format(train_set_name, test_set_name,
                                                featureset),
                  file=log_file)
        elif task == 'train':
            print("Training on {}, feature set {} ...".format(train_set_name,
                                                              featureset),
                  file=log_file)
        else:  # predict
            print(("Training on {}, Making predictions about {}, " +
                   "feature set {} ...").format(train_set_name, test_set_name,
                                                featureset),
                  file=log_file)

        # check whether a trained model on the same data with the same
        # featureset already exists; if so, load it and then use it on
        # the test data
        modelfile = os.path.join(model_path, '{}.model'.format(job_name))

        # load the training and test examples
        if task == 'cross_validate' or (not os.path.exists(modelfile) or
                                        overwrite):
            train_examples = _load_featureset(train_path,
                                              featureset,
                                              suffix,
                                              label_col=label_col,
                                              ids_to_floats=ids_to_floats,
                                              quiet=quiet,
                                              class_map=class_map)
            # initialize a classifier object
            learner = Learner(learner_name,
                              probability=probability,
                              feature_scaling=feature_scaling,
                              model_kwargs=fixed_parameters,
                              pos_label_str=pos_label_str,
                              min_feature_count=min_feature_count)
        # load the model if it already exists
        else:
            if os.path.exists(modelfile) and not overwrite:
                print(('\tloading pre-existing {} ' +
                       'model: {}').format(learner_name, modelfile))
            learner = Learner.from_file(modelfile)

        # Load test set if there is one
        if task == 'evaluate' or task == 'predict':
            test_examples = _load_featureset(test_path,
                                             featureset,
                                             suffix,
                                             label_col=label_col,
                                             ids_to_floats=ids_to_floats,
                                             quiet=quiet,
                                             class_map=class_map,
                                             unlabelled=True)

        # create a list of dictionaries of the results information
        learner_result_dict_base = {'experiment_name': experiment_name,
                                    'train_set_name': train_set_name,
                                    'test_set_name': test_set_name,
                                    'featureset': json.dumps(featureset),
                                    'learner_name': learner_name,
                                    'task': task,
                                    'timestamp': timestamp,
                                    'version': __version__,
                                    'feature_scaling': feature_scaling,
                                    'grid_search': grid_search,
                                    'grid_objective': grid_objective,
                                    'min_feature_count': min_feature_count}

        # check if we're doing cross-validation, because we only load/save
        # models when we're not.
        task_results = None
        if task == 'cross_validate':
            print('\tcross-validating', file=log_file)
            task_results, grid_scores = learner.cross_validate(
                train_examples,
                prediction_prefix=prediction_prefix,
                grid_search=grid_search,
                cv_folds=cv_folds,
                grid_objective=grid_objective,
                param_grid=param_grid,
                grid_jobs=grid_search_jobs)
        else:
            # if we do not have a saved model, we need to train one.
            if not os.path.exists(modelfile) or overwrite:
                print(('\tfeaturizing and training new ' +
                       '{} model').format(learner_name),
                      file=log_file)

                grid_search_folds = 5
                if not isinstance(cv_folds, int):
                    grid_search_folds = cv_folds

                best_score = learner.train(train_examples,
                                           grid_search=grid_search,
                                           grid_search_folds=grid_search_folds,
                                           grid_objective=grid_objective,
                                           param_grid=param_grid,
                                           grid_jobs=grid_search_jobs)
                grid_scores = [best_score]

                # save model
                if model_path:
                    learner.save(modelfile)

                if grid_search:
                    # note: bankers' rounding is used in python 3,
                    # so these scores may be different between runs in
                    # python 2 and 3 at the final decimal place.
                    print('\tbest {} grid search score: {}'
                          .format(grid_objective, round(best_score, 3)),
                          file=log_file)
            else:
                grid_scores = [None]

            # print out the tuned parameters and best CV score
            param_out = ('{}: {}'.format(param_name, param_value)
                         for param_name, param_value in
                         iteritems(learner.model.get_params()))
            print('\thyperparameters: {}'.format(', '.join(param_out)),
                  file=log_file)

            # run on test set or cross-validate on training data,
            # depending on what was asked for
            if task == 'evaluate':
                print('\tevaluating predictions', file=log_file)
                task_results = [
                    learner.evaluate(test_examples,
                                     prediction_prefix=prediction_prefix,
                                     grid_objective=grid_objective)
                ]
            elif task == 'predict':
                print('\twriting predictions', file=log_file)
                learner.predict(test_examples,
                                prediction_prefix=prediction_prefix)
            # do nothing here for train

        if task == 'cross_validate' or task == 'evaluate':
            results_json_path = os.path.join(
                results_path, '{}.results.json'.format(job_name))

            res = _create_learner_result_dicts(task_results,
                                               grid_scores,
                                               learner_result_dict_base)

            # write out the result dictionary to a json file
            file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
            with open(results_json_path, file_mode) as json_file:
                json.dump(res, json_file)

            with open(os.path.join(results_path,
                                   '{}.results'.format(job_name)),
                      'w') as output_file:
                _print_fancy_output(res, output_file)
        else:
            res = [learner_result_dict_base]

    return res
def check_print_model_weights(task='classification'):

    # create some simple classification or regression data
    if task == 'classification':
        train_fs, _ = make_classification_data(train_test_ratio=0.8)
    elif task == 'multiclass_classification':
        train_fs, _ = make_classification_data(train_test_ratio=0.8,
                                                num_labels=3)
    else:
        train_fs, _, _ = make_regression_data(num_features=4,
                                              train_test_ratio=0.8)

    # now train the appropriate model
    if task == 'classification' or task == 'multiclass_classification':
        learner = Learner('LogisticRegression')
        learner.train(train_fs, grid_objective='f1_score_micro')
    elif task == 'regression':
        learner = Learner('LinearRegression')
        learner.train(train_fs, grid_objective='pearson')
    else:
        learner = Learner('LinearSVR')
        learner.train(train_fs, grid_objective='pearson')

    # now save the model to disk
    model_file = join(_my_dir, 'output', 'test_print_model_weights.model')
    learner.save(model_file)

    # now call print_model_weights main() and capture the output
    print_model_weights_cmd = [model_file]
    err = ''
    try:
        old_stderr = sys.stderr
        old_stdout = sys.stdout
        sys.stderr = mystderr = StringIO()
        sys.stdout = mystdout = StringIO()
        pmw.main(print_model_weights_cmd)
        out = mystdout.getvalue()
        err = mystderr.getvalue()
    finally:
        sys.stderr = old_stderr
        sys.stdout = old_stdout
        print(err)

    # now parse the output of the print_model_weight command
    # and get the intercept and the feature values
    if task == 'classification':
        lines_to_parse = [l for l in out.split('\n')[1:] if l]
        intercept = safe_float(lines_to_parse[0].split('\t')[0])
        feature_values = []
        for ltp in lines_to_parse[1:]:
            fields = ltp.split('\t')
            feature_values.append((fields[2], safe_float(fields[0])))
        feature_values = [t[1] for t in sorted(feature_values)]
        assert_almost_equal(intercept, learner.model.intercept_[0])
        assert_allclose(learner.model.coef_[0], feature_values)
    elif task == 'multiclass_classification':
        # for multiple classes we get an intercept for each class
        # as well as a list of weights for each class
        lines_to_parse = [l for l in out.split('\n')[1:] if l]
        intercept = []
        for intercept_string in lines_to_parse[0:3]:
            intercept.append(safe_float(intercept_string.split('\t')[0]))
        feature_values = [[], [], []]
        for ltp in lines_to_parse[3:]:
            fields = ltp.split('\t')
            feature_values[int(fields[1])].append((fields[2],
                                                   safe_float(fields[0])))
        for index, weights in enumerate(feature_values):
            feature_values[index] = [t[1] for t in sorted(weights)]
        for index, weights in enumerate(learner.model.coef_):
            assert_array_almost_equal(weights, feature_values[index])
        assert_array_almost_equal(intercept, learner.model.intercept_)
    elif task == 'regression':
        lines_to_parse = [l for l in out.split('\n') if l]
        intercept = safe_float(lines_to_parse[0].split('=')[1])
        feature_values = []
        for ltp in lines_to_parse[1:]:
            fields = ltp.split('\t')
            feature_values.append((fields[1], safe_float(fields[0])))
        feature_values = [t[1] for t in sorted(feature_values)]
        assert_almost_equal(intercept, learner.model.intercept_)
        assert_allclose(learner.model.coef_, feature_values)
    else:
        lines_to_parse = [l for l in out.split('\n') if l]
        intercept_list = ast.literal_eval(lines_to_parse[0].split('=')[1].strip())
        intercept = []
        for intercept_string in intercept_list:
            intercept.append(safe_float(intercept_string))
        feature_values = []
        for ltp in lines_to_parse[1:]:
            fields = ltp.split('\t')
            feature_values.append((fields[1], safe_float(fields[0])))
        feature_values = [t[1] for t in sorted(feature_values)]
        assert_array_almost_equal(intercept, learner.model.intercept_)
        assert_allclose(learner.model.coef_, feature_values)